From b72f1119c654a2e3aeb41175ff94299a4ec01afb Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 25 Mar 2017 16:35:29 +0800 Subject: [PATCH 001/546] netfilter: nf_ct_ext: fix possible panic after nf_ct_extend_unregister commit 9c3f3794926a997b1cab6c42480ff300efa2d162 upstream. If one cpu is doing nf_ct_extend_unregister while another cpu is doing __nf_ct_ext_add_length, then we may hit BUG_ON(t == NULL). Moreover, there's no synchronize_rcu invocation after set nf_ct_ext_types[id] to NULL, so it's possible that we may access invalid pointer. But actually, most of the ct extends are built-in, so the problem listed above will not happen. However, there are two exceptions: NF_CT_EXT_NAT and NF_CT_EXT_SYNPROXY. For _EXT_NAT, the panic will not happen, since adding the nat extend and unregistering the nat extend are located in the same file(nf_nat_core.c), this means that after the nat module is removed, we cannot add the nat extend too. For _EXT_SYNPROXY, synproxy extend may be added by init_conntrack, while synproxy extend unregister will be done by synproxy_core_exit. So after nf_synproxy_core.ko is removed, we may still try to add the synproxy extend, then kernel panic may happen. I know it's very hard to reproduce this issue, but I can play a tricky game to make it happen very easily :) Step 1. Enable SYNPROXY for tcp dport 1234 at FORWARD hook: # iptables -I FORWARD -p tcp --dport 1234 -j SYNPROXY Step 2. Queue the syn packet to the userspace at raw table OUTPUT hook. Also note, in the userspace we only add a 20s' delay, then reinject the syn packet to the kernel: # iptables -t raw -I OUTPUT -p tcp --syn -j NFQUEUE --queue-num 1 Step 3. Using "nc 2.2.2.2 1234" to connect the server. Step 4. Now remove the nf_synproxy_core.ko quickly: # iptables -F FORWARD # rmmod ipt_SYNPROXY # rmmod nf_synproxy_core Step 5. After 20s' delay, the syn packet is reinjected to the kernel. Now you will see the panic like this: kernel BUG at net/netfilter/nf_conntrack_extend.c:91! Call Trace: ? __nf_ct_ext_add_length+0x53/0x3c0 [nf_conntrack] init_conntrack+0x12b/0x600 [nf_conntrack] nf_conntrack_in+0x4cc/0x580 [nf_conntrack] ipv4_conntrack_local+0x48/0x50 [nf_conntrack_ipv4] nf_reinject+0x104/0x270 nfqnl_recv_verdict+0x3e1/0x5f9 [nfnetlink_queue] ? nfqnl_recv_verdict+0x5/0x5f9 [nfnetlink_queue] ? nla_parse+0xa0/0x100 nfnetlink_rcv_msg+0x175/0x6a9 [nfnetlink] [...] One possible solution is to make NF_CT_EXT_SYNPROXY extend built-in, i.e. introduce nf_conntrack_synproxy.c and only do ct extend register and unregister in it, similar to nf_conntrack_timeout.c. But having such a obscure restriction of nf_ct_extend_unregister is not a good idea, so we should invoke synchronize_rcu after set nf_ct_ext_types to NULL, and check the NULL pointer when do __nf_ct_ext_add_length. Then it will be easier if we add new ct extend in the future. Last, we use kfree_rcu to free nf_ct_ext, so rcu_barrier() is unnecessary anymore, remove it too. Signed-off-by: Liping Zhang Acked-by: Florian Westphal Signed-off-by: Pablo Neira Ayuso Cc: Stefan Bader Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nf_conntrack_extend.c | 13 ++++++++++--- 1 file changed, 10 insertions(+), 3 deletions(-) diff --git a/net/netfilter/nf_conntrack_extend.c b/net/netfilter/nf_conntrack_extend.c index 1a9545965c0d..531ca55f1af6 100644 --- a/net/netfilter/nf_conntrack_extend.c +++ b/net/netfilter/nf_conntrack_extend.c @@ -53,7 +53,11 @@ nf_ct_ext_create(struct nf_ct_ext **ext, enum nf_ct_ext_id id, rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[id]); - BUG_ON(t == NULL); + if (!t) { + rcu_read_unlock(); + return NULL; + } + off = ALIGN(sizeof(struct nf_ct_ext), t->align); len = off + t->len + var_alloc_len; alloc_size = t->alloc_size + var_alloc_len; @@ -88,7 +92,10 @@ void *__nf_ct_ext_add_length(struct nf_conn *ct, enum nf_ct_ext_id id, rcu_read_lock(); t = rcu_dereference(nf_ct_ext_types[id]); - BUG_ON(t == NULL); + if (!t) { + rcu_read_unlock(); + return NULL; + } newoff = ALIGN(old->len, t->align); newlen = newoff + t->len + var_alloc_len; @@ -186,6 +193,6 @@ void nf_ct_extend_unregister(struct nf_ct_ext_type *type) RCU_INIT_POINTER(nf_ct_ext_types[type->id], NULL); update_alloc_size(type); mutex_unlock(&nf_ct_ext_type_mutex); - rcu_barrier(); /* Wait for completion of call_rcu()'s */ + synchronize_rcu(); } EXPORT_SYMBOL_GPL(nf_ct_extend_unregister); From ea088172692cbadddc4efcdae2a5dfe1693e8b20 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Tue, 15 Aug 2017 13:00:36 +0200 Subject: [PATCH 002/546] audit: Fix use after free in audit_remove_watch_rule() commit d76036ab47eafa6ce52b69482e91ca3ba337d6d6 upstream. audit_remove_watch_rule() drops watch's reference to parent but then continues to work with it. That is not safe as parent can get freed once we drop our reference. The following is a trivial reproducer: mount -o loop image /mnt touch /mnt/file auditctl -w /mnt/file -p wax umount /mnt auditctl -D Grab our own reference in audit_remove_watch_rule() earlier to make sure mark does not get freed under us. Reported-by: Tony Jones Signed-off-by: Jan Kara Tested-by: Tony Jones Signed-off-by: Paul Moore Signed-off-by: Greg Kroah-Hartman --- kernel/audit_watch.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 939945a5649c..a162661c9d60 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -457,13 +457,15 @@ void audit_remove_watch_rule(struct audit_krule *krule) list_del(&krule->rlist); if (list_empty(&watch->rules)) { + /* + * audit_remove_watch() drops our reference to 'parent' which + * can get freed. Grab our own reference to be safe. + */ + audit_get_parent(parent); audit_remove_watch(watch); - - if (list_empty(&parent->watches)) { - audit_get_parent(parent); + if (list_empty(&parent->watches)) fsnotify_destroy_mark(&parent->mark, audit_watch_group); - audit_put_parent(parent); - } + audit_put_parent(parent); } } From 04f4f73ffe9383921794b833346911bd1e19b228 Mon Sep 17 00:00:00 2001 From: Thomas Bogendoerfer Date: Sat, 12 Aug 2017 23:36:47 +0200 Subject: [PATCH 003/546] parisc: pci memory bar assignment fails with 64bit kernels on dino/cujo commit 4098116039911e8870d84c975e2ec22dab65a909 upstream. For 64bit kernels the lmmio_space_offset of the host bridge window isn't set correctly on systems with dino/cujo PCI host bridges. This leads to not assigned memory bars and failing drivers, which need to use these bars. Signed-off-by: Thomas Bogendoerfer Acked-by: Helge Deller Signed-off-by: Helge Deller Signed-off-by: Greg Kroah-Hartman --- drivers/parisc/dino.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/parisc/dino.c b/drivers/parisc/dino.c index 7b0ca1551d7b..005ea632ba53 100644 --- a/drivers/parisc/dino.c +++ b/drivers/parisc/dino.c @@ -954,7 +954,7 @@ static int __init dino_probe(struct parisc_device *dev) dino_dev->hba.dev = dev; dino_dev->hba.base_addr = ioremap_nocache(hpa, 4096); - dino_dev->hba.lmmio_space_offset = 0; /* CPU addrs == bus addrs */ + dino_dev->hba.lmmio_space_offset = PCI_F_EXTEND; spin_lock_init(&dino_dev->dinosaur_pen); dino_dev->hba.iommu = ccio_get_iommu(dev); From 4362533a04680de13acd3e6f5a16c3d81502f589 Mon Sep 17 00:00:00 2001 From: "megha.dey@linux.intel.com" Date: Wed, 2 Aug 2017 13:49:09 -0700 Subject: [PATCH 004/546] crypto: x86/sha1 - Fix reads beyond the number of blocks passed commit 8861249c740fc4af9ddc5aee321eafefb960d7c6 upstream. It was reported that the sha1 AVX2 function(sha1_transform_avx2) is reading ahead beyond its intended data, and causing a crash if the next block is beyond page boundary: http://marc.info/?l=linux-crypto-vger&m=149373371023377 This patch makes sure that there is no overflow for any buffer length. It passes the tests written by Jan Stancek that revealed this problem: https://github.com/jstancek/sha1-avx2-crash I have re-enabled sha1-avx2 by reverting commit b82ce24426a4071da9529d726057e4e642948667 Fixes: b82ce24426a4 ("crypto: sha1-ssse3 - Disable avx2") Originally-by: Ilya Albrekht Tested-by: Jan Stancek Signed-off-by: Megha Dey Reported-by: Jan Stancek Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- arch/x86/crypto/sha1_avx2_x86_64_asm.S | 67 ++++++++++++++------------ arch/x86/crypto/sha1_ssse3_glue.c | 2 +- 2 files changed, 37 insertions(+), 32 deletions(-) diff --git a/arch/x86/crypto/sha1_avx2_x86_64_asm.S b/arch/x86/crypto/sha1_avx2_x86_64_asm.S index 1cd792db15ef..1eab79c9ac48 100644 --- a/arch/x86/crypto/sha1_avx2_x86_64_asm.S +++ b/arch/x86/crypto/sha1_avx2_x86_64_asm.S @@ -117,11 +117,10 @@ .set T1, REG_T1 .endm -#define K_BASE %r8 #define HASH_PTR %r9 +#define BLOCKS_CTR %r8 #define BUFFER_PTR %r10 #define BUFFER_PTR2 %r13 -#define BUFFER_END %r11 #define PRECALC_BUF %r14 #define WK_BUF %r15 @@ -205,14 +204,14 @@ * blended AVX2 and ALU instruction scheduling * 1 vector iteration per 8 rounds */ - vmovdqu ((i * 2) + PRECALC_OFFSET)(BUFFER_PTR), W_TMP + vmovdqu (i * 2)(BUFFER_PTR), W_TMP .elseif ((i & 7) == 1) - vinsertf128 $1, (((i-1) * 2)+PRECALC_OFFSET)(BUFFER_PTR2),\ + vinsertf128 $1, ((i-1) * 2)(BUFFER_PTR2),\ WY_TMP, WY_TMP .elseif ((i & 7) == 2) vpshufb YMM_SHUFB_BSWAP, WY_TMP, WY .elseif ((i & 7) == 4) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP .elseif ((i & 7) == 7) vmovdqu WY_TMP, PRECALC_WK(i&~7) @@ -255,7 +254,7 @@ vpxor WY, WY_TMP, WY_TMP .elseif ((i & 7) == 7) vpxor WY_TMP2, WY_TMP, WY - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -291,7 +290,7 @@ vpsrld $30, WY, WY vpor WY, WY_TMP, WY .elseif ((i & 7) == 7) - vpaddd K_XMM(K_BASE), WY, WY_TMP + vpaddd K_XMM + K_XMM_AR(%rip), WY, WY_TMP vmovdqu WY_TMP, PRECALC_WK(i&~7) PRECALC_ROTATE_WY @@ -446,6 +445,16 @@ .endm +/* Add constant only if (%2 > %3) condition met (uses RTA as temp) + * %1 + %2 >= %3 ? %4 : 0 + */ +.macro ADD_IF_GE a, b, c, d + mov \a, RTA + add $\d, RTA + cmp $\c, \b + cmovge RTA, \a +.endm + /* * macro implements 80 rounds of SHA-1, for multiple blocks with s/w pipelining */ @@ -463,13 +472,16 @@ lea (2*4*80+32)(%rsp), WK_BUF # Precalc WK for first 2 blocks - PRECALC_OFFSET = 0 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 2, 64 .set i, 0 .rept 160 PRECALC i .set i, i + 1 .endr - PRECALC_OFFSET = 128 + + /* Go to next block if needed */ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 3, 128 + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 xchg WK_BUF, PRECALC_BUF .align 32 @@ -479,8 +491,8 @@ _loop: * we use K_BASE value as a signal of a last block, * it is set below by: cmovae BUFFER_PTR, K_BASE */ - cmp K_BASE, BUFFER_PTR - jne _begin + test BLOCKS_CTR, BLOCKS_CTR + jnz _begin .align 32 jmp _end .align 32 @@ -512,10 +524,10 @@ _loop0: .set j, j+2 .endr - add $(2*64), BUFFER_PTR /* move to next odd-64-byte block */ - cmp BUFFER_END, BUFFER_PTR /* is current block the last one? */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ - + /* Update Counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR, BLOCKS_CTR, 4, 128 /* * rounds * 60,62,64,66,68 @@ -532,8 +544,8 @@ _loop0: UPDATE_HASH 12(HASH_PTR), D UPDATE_HASH 16(HASH_PTR), E - cmp K_BASE, BUFFER_PTR /* is current block the last one? */ - je _loop + test BLOCKS_CTR, BLOCKS_CTR + jz _loop mov TB, B @@ -575,10 +587,10 @@ _loop2: .set j, j+2 .endr - add $(2*64), BUFFER_PTR2 /* move to next even-64-byte block */ - - cmp BUFFER_END, BUFFER_PTR2 /* is current block the last one */ - cmovae K_BASE, BUFFER_PTR /* signal the last iteration smartly */ + /* update counter */ + sub $1, BLOCKS_CTR + /* Move to the next block only if needed*/ + ADD_IF_GE BUFFER_PTR2, BLOCKS_CTR, 4, 128 jmp _loop3 _loop3: @@ -641,19 +653,12 @@ _loop3: avx2_zeroupper - lea K_XMM_AR(%rip), K_BASE - + /* Setup initial values */ mov CTX, HASH_PTR mov BUF, BUFFER_PTR - lea 64(BUF), BUFFER_PTR2 - shl $6, CNT /* mul by 64 */ - add BUF, CNT - add $64, CNT - mov CNT, BUFFER_END - - cmp BUFFER_END, BUFFER_PTR2 - cmovae K_BASE, BUFFER_PTR2 + mov BUF, BUFFER_PTR2 + mov CNT, BLOCKS_CTR xmm_mov BSWAP_SHUFB_CTL(%rip), YMM_SHUFB_BSWAP diff --git a/arch/x86/crypto/sha1_ssse3_glue.c b/arch/x86/crypto/sha1_ssse3_glue.c index 7de207a11014..dd14616b7739 100644 --- a/arch/x86/crypto/sha1_ssse3_glue.c +++ b/arch/x86/crypto/sha1_ssse3_glue.c @@ -201,7 +201,7 @@ asmlinkage void sha1_transform_avx2(u32 *digest, const char *data, static bool avx2_usable(void) { - if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI1) && boot_cpu_has(X86_FEATURE_BMI2)) return true; From 0dbf7f7811df34f0e6af97b885619780014f81e1 Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Mon, 14 Aug 2017 20:11:26 -0700 Subject: [PATCH 005/546] Input: elan_i2c - add ELAN0608 to the ACPI table commit 1874064eed0502bd9bef7be8023757b0c4f26883 upstream. Similar to commit 722c5ac708b4f ("Input: elan_i2c - add ELAN0605 to the ACPI table"), ELAN0608 should be handled by elan_i2c. This touchpad can be found in Lenovo ideapad 320-14IKB. BugLink: https://bugs.launchpad.net/bugs/1708852 Signed-off-by: Kai-Heng Feng Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elan_i2c_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index da5458dfb1e3..d5309d0a0482 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1235,6 +1235,7 @@ static const struct acpi_device_id elan_acpi_id[] = { { "ELAN0100", 0 }, { "ELAN0600", 0 }, { "ELAN0605", 0 }, + { "ELAN0608", 0 }, { "ELAN1000", 0 }, { } }; From ae4743cac8d771a614da982ed51ffd396041c3ec Mon Sep 17 00:00:00 2001 From: KT Liao Date: Mon, 14 Aug 2017 20:11:59 -0700 Subject: [PATCH 006/546] Input: elan_i2c - Add antoher Lenovo ACPI ID for upcoming Lenovo NB commit 76988690402dde2880bfe06ecccf381d48ba8e1c upstream. Add 2 new IDs (ELAN0609 and ELAN060B) to the list of ACPI IDs that should be handled by the driver. Signed-off-by: KT Liao Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elan_i2c_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index d5309d0a0482..98d4e515587a 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1236,6 +1236,9 @@ static const struct acpi_device_id elan_acpi_id[] = { { "ELAN0600", 0 }, { "ELAN0605", 0 }, { "ELAN0608", 0 }, + { "ELAN0605", 0 }, + { "ELAN0609", 0 }, + { "ELAN060B", 0 }, { "ELAN1000", 0 }, { } }; From 735aa043bf00c809a93d01c7ef4039115f1ef590 Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Mon, 14 Aug 2017 14:46:01 -0700 Subject: [PATCH 007/546] ALSA: seq: 2nd attempt at fixing race creating a queue commit 7e1d90f60a0d501c8503e636942ca704a454d910 upstream. commit 4842e98f26dd80be3623c4714a244ba52ea096a8 ("ALSA: seq: Fix race at creating a queue") attempted to fix a race reported by syzkaller. That fix has been described as follows: " When a sequencer queue is created in snd_seq_queue_alloc(),it adds the new queue element to the public list before referencing it. Thus the queue might be deleted before the call of snd_seq_queue_use(), and it results in the use-after-free error, as spotted by syzkaller. The fix is to reference the queue object at the right time. " Even with that fix in place, syzkaller reported a use-after-free error. It specifically pointed to the last instruction "return q->queue" in snd_seq_queue_alloc(). The pointer q is being used after kfree() has been called on it. It turned out that there is still a small window where a race can happen. The window opens at snd_seq_ioctl_create_queue()->snd_seq_queue_alloc()->queue_list_add() and closes at snd_seq_ioctl_create_queue()->queueptr()->snd_use_lock_use(). Between these two calls, a different thread could delete the queue and possibly re-create a different queue in the same location in queue_list. This change prevents this situation by calling snd_use_lock_use() from snd_seq_queue_alloc() prior to calling queue_list_add(). It is then the caller's responsibility to call snd_use_lock_free(&q->use_lock). Fixes: 4842e98f26dd ("ALSA: seq: Fix race at creating a queue") Reported-by: Dmitry Vyukov Signed-off-by: Daniel Mentz Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/seq/seq_clientmgr.c | 13 ++++--------- sound/core/seq/seq_queue.c | 14 +++++++++----- sound/core/seq/seq_queue.h | 2 +- 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index c67f9c212dd1..e326c1d80416 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1530,19 +1530,14 @@ static int snd_seq_ioctl_create_queue(struct snd_seq_client *client, void __user *arg) { struct snd_seq_queue_info info; - int result; struct snd_seq_queue *q; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; - result = snd_seq_queue_alloc(client->number, info.locked, info.flags); - if (result < 0) - return result; - - q = queueptr(result); - if (q == NULL) - return -EINVAL; + q = snd_seq_queue_alloc(client->number, info.locked, info.flags); + if (IS_ERR(q)) + return PTR_ERR(q); info.queue = q->queue; info.locked = q->locked; @@ -1552,7 +1547,7 @@ static int snd_seq_ioctl_create_queue(struct snd_seq_client *client, if (! info.name[0]) snprintf(info.name, sizeof(info.name), "Queue-%d", q->queue); strlcpy(q->name, info.name, sizeof(q->name)); - queuefree(q); + snd_use_lock_free(&q->use_lock); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; diff --git a/sound/core/seq/seq_queue.c b/sound/core/seq/seq_queue.c index 450c5187eecb..79e0c5604ef8 100644 --- a/sound/core/seq/seq_queue.c +++ b/sound/core/seq/seq_queue.c @@ -184,22 +184,26 @@ void __exit snd_seq_queues_delete(void) static void queue_use(struct snd_seq_queue *queue, int client, int use); /* allocate a new queue - - * return queue index value or negative value for error + * return pointer to new queue or ERR_PTR(-errno) for error + * The new queue's use_lock is set to 1. It is the caller's responsibility to + * call snd_use_lock_free(&q->use_lock). */ -int snd_seq_queue_alloc(int client, int locked, unsigned int info_flags) +struct snd_seq_queue *snd_seq_queue_alloc(int client, int locked, unsigned int info_flags) { struct snd_seq_queue *q; q = queue_new(client, locked); if (q == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); q->info_flags = info_flags; queue_use(q, client, 1); + snd_use_lock_use(&q->use_lock); if (queue_list_add(q) < 0) { + snd_use_lock_free(&q->use_lock); queue_delete(q); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } - return q->queue; + return q; } /* delete a queue - queue must be owned by the client */ diff --git a/sound/core/seq/seq_queue.h b/sound/core/seq/seq_queue.h index 30c8111477f6..719093489a2c 100644 --- a/sound/core/seq/seq_queue.h +++ b/sound/core/seq/seq_queue.h @@ -71,7 +71,7 @@ void snd_seq_queues_delete(void); /* create new queue (constructor) */ -int snd_seq_queue_alloc(int client, int locked, unsigned int flags); +struct snd_seq_queue *snd_seq_queue_alloc(int client, int locked, unsigned int flags); /* delete queue (destructor) */ int snd_seq_queue_delete(int client, int queueid); From f600f9c43346a7f26e9b808b827adfaf5e73958b Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 14 Aug 2017 14:35:50 +0200 Subject: [PATCH 008/546] ALSA: usb-audio: Apply sample rate quirk to Sennheiser headset commit a8e800fe0f68bc28ce309914f47e432742b865ed upstream. A Senheisser headset requires the typical sample-rate quirk for avoiding spurious errors from inquiring the current sample rate like: usb 1-1: 2:1: cannot get freq at ep 0x4 usb 1-1: 3:1: cannot get freq at ep 0x83 The USB ID 1395:740a has to be added to the entries in snd_usb_get_sample_rate_quirk(). Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1052580 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/quirks.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/usb/quirks.c b/sound/usb/quirks.c index 29f38e2b4ca9..1cc20d138dae 100644 --- a/sound/usb/quirks.c +++ b/sound/usb/quirks.c @@ -1143,6 +1143,7 @@ bool snd_usb_get_sample_rate_quirk(struct snd_usb_audio *chip) case USB_ID(0x0556, 0x0014): /* Phoenix Audio TMX320VC */ case USB_ID(0x05A3, 0x9420): /* ELP HD USB Camera */ case USB_ID(0x074D, 0x3553): /* Outlaw RR2150 (Micronas UAC3553B) */ + case USB_ID(0x1395, 0x740a): /* Sennheiser DECT */ case USB_ID(0x1901, 0x0191): /* GE B850V3 CP2114 audio interface */ case USB_ID(0x1de7, 0x0013): /* Phoenix Audio MT202exe */ case USB_ID(0x1de7, 0x0014): /* Phoenix Audio TMX320 */ From 669c8ab896a2f96a1d49cfa1f5efaaf64c0f6aaa Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 16 Aug 2017 14:18:37 +0200 Subject: [PATCH 009/546] ALSA: usb-audio: Add mute TLV for playback volumes on C-Media devices commit 0f174b3525a43bd51f9397394763925e0ebe7bc7 upstream. C-Media devices (at least some models) mute the playback stream when volumes are set to the minimum value. But this isn't informed via TLV and the user-space, typically PulseAudio, gets confused as if it's still played in a low volume. This patch adds the new flag, min_mute, to struct usb_mixer_elem_info for indicating that the mixer element is with the minimum-mute volume. This flag is set for known C-Media devices in snd_usb_mixer_fu_apply_quirk() in turn. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196669 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/mixer.c | 2 ++ sound/usb/mixer.h | 1 + sound/usb/mixer_quirks.c | 6 ++++++ 3 files changed, 9 insertions(+) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 499b03c8281d..696de5ac69be 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -541,6 +541,8 @@ int snd_usb_mixer_vol_tlv(struct snd_kcontrol *kcontrol, int op_flag, if (size < sizeof(scale)) return -ENOMEM; + if (cval->min_mute) + scale[0] = SNDRV_CTL_TLVT_DB_MINMAX_MUTE; scale[2] = cval->dBmin; scale[3] = cval->dBmax; if (copy_to_user(_tlv, scale, sizeof(scale))) diff --git a/sound/usb/mixer.h b/sound/usb/mixer.h index 3417ef347e40..2b4b067646ab 100644 --- a/sound/usb/mixer.h +++ b/sound/usb/mixer.h @@ -64,6 +64,7 @@ struct usb_mixer_elem_info { int cached; int cache_val[MAX_CHANNELS]; u8 initialized; + u8 min_mute; void *private_data; }; diff --git a/sound/usb/mixer_quirks.c b/sound/usb/mixer_quirks.c index 04991b009132..5d2fc5f58bfe 100644 --- a/sound/usb/mixer_quirks.c +++ b/sound/usb/mixer_quirks.c @@ -1873,6 +1873,12 @@ void snd_usb_mixer_fu_apply_quirk(struct usb_mixer_interface *mixer, if (unitid == 7 && cval->control == UAC_FU_VOLUME) snd_dragonfly_quirk_db_scale(mixer, cval, kctl); break; + /* lowest playback value is muted on C-Media devices */ + case USB_ID(0x0d8c, 0x000c): + case USB_ID(0x0d8c, 0x0014): + if (strstr(kctl->id.name, "Playback")) + cval->min_mute = 1; + break; } } From cc971fa12bd2dff6c0432c860d784c6cdaf5a04b Mon Sep 17 00:00:00 2001 From: zhong jiang Date: Fri, 18 Aug 2017 15:16:24 -0700 Subject: [PATCH 010/546] mm/mempolicy: fix use after free when calling get_mempolicy commit 73223e4e2e3867ebf033a5a8eb2e5df0158ccc99 upstream. I hit a use after free issue when executing trinity and repoduced it with KASAN enabled. The related call trace is as follows. BUG: KASan: use after free in SyS_get_mempolicy+0x3c8/0x960 at addr ffff8801f582d766 Read of size 2 by task syz-executor1/798 INFO: Allocated in mpol_new.part.2+0x74/0x160 age=3 cpu=1 pid=799 __slab_alloc+0x768/0x970 kmem_cache_alloc+0x2e7/0x450 mpol_new.part.2+0x74/0x160 mpol_new+0x66/0x80 SyS_mbind+0x267/0x9f0 system_call_fastpath+0x16/0x1b INFO: Freed in __mpol_put+0x2b/0x40 age=4 cpu=1 pid=799 __slab_free+0x495/0x8e0 kmem_cache_free+0x2f3/0x4c0 __mpol_put+0x2b/0x40 SyS_mbind+0x383/0x9f0 system_call_fastpath+0x16/0x1b INFO: Slab 0xffffea0009cb8dc0 objects=23 used=8 fp=0xffff8801f582de40 flags=0x200000000004080 INFO: Object 0xffff8801f582d760 @offset=5984 fp=0xffff8801f582d600 Bytes b4 ffff8801f582d750: ae 01 ff ff 00 00 00 00 5a 5a 5a 5a 5a 5a 5a 5a ........ZZZZZZZZ Object ffff8801f582d760: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b kkkkkkkkkkkkkkkk Object ffff8801f582d770: 6b 6b 6b 6b 6b 6b 6b a5 kkkkkkk. Redzone ffff8801f582d778: bb bb bb bb bb bb bb bb ........ Padding ffff8801f582d8b8: 5a 5a 5a 5a 5a 5a 5a 5a ZZZZZZZZ Memory state around the buggy address: ffff8801f582d600: fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8801f582d680: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc >ffff8801f582d700: fc fc fc fc fc fc fc fc fc fc fc fc fb fb fb fc !shared memory policy is not protected against parallel removal by other thread which is normally protected by the mmap_sem. do_get_mempolicy, however, drops the lock midway while we can still access it later. Early premature up_read is a historical artifact from times when put_user was called in this path see https://lwn.net/Articles/124754/ but that is gone since 8bccd85ffbaf ("[PATCH] Implement sys_* do_* layering in the memory policy layer."). but when we have the the current mempolicy ref count model. The issue was introduced accordingly. Fix the issue by removing the premature release. Link: http://lkml.kernel.org/r/1502950924-27521-1-git-send-email-zhongjiang@huawei.com Signed-off-by: zhong jiang Acked-by: Michal Hocko Cc: Minchan Kim Cc: Vlastimil Babka Cc: David Rientjes Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/mempolicy.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index e09b1a0e2cfe..c947014d128a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -894,11 +894,6 @@ static long do_get_mempolicy(int *policy, nodemask_t *nmask, *policy |= (pol->flags & MPOL_MODE_FLAGS); } - if (vma) { - up_read(¤t->mm->mmap_sem); - vma = NULL; - } - err = 0; if (nmask) { if (mpol_store_user_nodemask(pol)) { From 240628085effc47e86f51fc3fb37bc0e628f9a85 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Fri, 18 Aug 2017 15:16:31 -0700 Subject: [PATCH 011/546] mm: revert x86_64 and arm64 ELF_ET_DYN_BASE base changes commit c715b72c1ba406f133217b509044c38d8e714a37 upstream. Moving the x86_64 and arm64 PIE base from 0x555555554000 to 0x000100000000 broke AddressSanitizer. This is a partial revert of: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") The AddressSanitizer tool has hard-coded expectations about where executable mappings are loaded. The motivation for changing the PIE base in the above commits was to avoid the Stack-Clash CVEs that allowed executable mappings to get too close to heap and stack. This was mainly a problem on 32-bit, but the 64-bit bases were moved too, in an effort to proactively protect those systems (proofs of concept do exist that show 64-bit collisions, but other recent changes to fix stack accounting and setuid behaviors will minimize the impact). The new 32-bit PIE base is fine for ASan (since it matches the ET_EXEC base), so only the 64-bit PIE base needs to be reverted to let x86 and arm64 ASan binaries run again. Future changes to the 64-bit PIE base on these architectures can be made optional once a more dynamic method for dealing with AddressSanitizer is found. (e.g. always loading PIE into the mmap region for marked binaries.) Link: http://lkml.kernel.org/r/20170807201542.GA21271@beast Fixes: eab09532d400 ("binfmt_elf: use ELF_ET_DYN_BASE only for PIE") Fixes: 02445990a96e ("arm64: move ELF_ET_DYN_BASE to 4GB / 4MB") Signed-off-by: Kees Cook Reported-by: Kostya Serebryany Acked-by: Will Deacon Cc: Ingo Molnar Cc: "H. Peter Anvin" Cc: Thomas Gleixner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- arch/arm64/include/asm/elf.h | 4 ++-- arch/x86/include/asm/elf.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h index 9e11dbe1cec3..329c127e13dc 100644 --- a/arch/arm64/include/asm/elf.h +++ b/arch/arm64/include/asm/elf.h @@ -121,10 +121,10 @@ typedef struct user_fpsimd_state elf_fpregset_t; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ -#define ELF_ET_DYN_BASE 0x100000000UL +#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) /* * When the program starts, a1 contains a pointer to a function to be diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index 07cf288b692e..bcd3d6199464 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -247,11 +247,11 @@ extern int force_personality32; /* * This is the base location for PIE (ET_DYN with INTERP) loads. On - * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * 64-bit, this is above 4GB to leave the entire 32-bit address * space open for things that want to use the area for 32-bit pointers. */ #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ - 0x100000000UL) + (TASK_SIZE / 3 * 2)) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, From c0b397fd6b2b8ed7b39a717340b85b4b1add5332 Mon Sep 17 00:00:00 2001 From: Roger Pau Monne Date: Tue, 18 Jul 2017 15:01:00 +0100 Subject: [PATCH 012/546] xen: fix bio vec merging MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 462cdace790ac2ed6aad1b19c9c0af0143b6aab0 upstream. The current test for bio vec merging is not fully accurate and can be tricked into merging bios when certain grant combinations are used. The result of these malicious bio merges is a bio that extends past the memory page used by any of the originating bios. Take into account the following scenario, where a guest creates two grant references that point to the same mfn, ie: grant 1 -> mfn A, grant 2 -> mfn A. These references are then used in a PV block request, and mapped by the backend domain, thus obtaining two different pfns that point to the same mfn, pfn B -> mfn A, pfn C -> mfn A. If those grants happen to be used in two consecutive sectors of a disk IO operation becoming two different bios in the backend domain, the checks in xen_biovec_phys_mergeable will succeed, because bfn1 == bfn2 (they both point to the same mfn). However due to the bio merging, the backend domain will end up with a bio that expands past mfn A into mfn A + 1. Fix this by making sure the check in xen_biovec_phys_mergeable takes into account the offset and the length of the bio, this basically replicates whats done in __BIOVEC_PHYS_MERGEABLE using mfns (bus addresses). While there also remove the usage of __BIOVEC_PHYS_MERGEABLE, since that's already checked by the callers of xen_biovec_phys_mergeable. Reported-by: "Jan H. Schönherr" Signed-off-by: Roger Pau Monné Reviewed-by: Juergen Gross Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- drivers/xen/biomerge.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/xen/biomerge.c b/drivers/xen/biomerge.c index 4da69dbf7dca..1bdd02a6d6ac 100644 --- a/drivers/xen/biomerge.c +++ b/drivers/xen/biomerge.c @@ -10,8 +10,7 @@ bool xen_biovec_phys_mergeable(const struct bio_vec *vec1, unsigned long bfn1 = pfn_to_bfn(page_to_pfn(vec1->bv_page)); unsigned long bfn2 = pfn_to_bfn(page_to_pfn(vec2->bv_page)); - return __BIOVEC_PHYS_MERGEABLE(vec1, vec2) && - ((bfn1 == bfn2) || ((bfn1+1) == bfn2)); + return bfn1 + PFN_DOWN(vec1->bv_offset + vec1->bv_len) == bfn2; #else /* * XXX: Add support for merging bio_vec when using different page From 64340986295dea6cf954caa630000a77775f9975 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Mon, 7 Aug 2017 19:43:13 -0700 Subject: [PATCH 013/546] x86/asm/64: Clear AC on NMI entries commit e93c17301ac55321fc18e0f8316e924e58a83c8c upstream. This closes a hole in our SMAP implementation. This patch comes from grsecurity. Good catch! Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: http://lkml.kernel.org/r/314cc9f294e8f14ed85485727556ad4f15bb1659.1502159503.git.luto@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/entry/entry_64.S | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index a55697d19824..cc0f2f5da19b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -1190,6 +1190,8 @@ ENTRY(nmi) * other IST entries. */ + ASM_CLAC + /* Use %rdx as our temp variable throughout */ pushq %rdx From ed281a6acaf1260800841fc8182e6a8b1d1b1371 Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:39 +0200 Subject: [PATCH 014/546] irqchip/atmel-aic: Fix unbalanced of_node_put() in aic_common_irq_fixup() commit 469bcef53c546bb792aa66303933272991b7831d upstream. aic_common_irq_fixup() is calling twice of_node_put() on the same node thus leading to an unbalanced refcount on the root node. Signed-off-by: Boris Brezillon Reported-by: Alexandre Belloni Fixes: b2f579b58e93 ("irqchip: atmel-aic: Add irq fixup infrastructure") Signed-off-by: Marc Zyngier Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-atmel-aic-common.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index 37199b9b2cfa..fe177224ee71 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -202,7 +202,6 @@ void __init aic_common_irq_fixup(const struct of_device_id *matches) return; match = of_match_node(matches, root); - of_node_put(root); if (match) { void (*fixup)(struct device_node *) = match->data; From b27e9ff9a5f457e85c47733387426bf522cef2aa Mon Sep 17 00:00:00 2001 From: Boris Brezillon Date: Tue, 4 Jul 2017 11:10:40 +0200 Subject: [PATCH 015/546] irqchip/atmel-aic: Fix unbalanced refcount in aic_common_rtc_irq_fixup() commit 277867ade8262583f4280cadbe90e0031a3706a7 upstream. of_find_compatible_node() is calling of_node_put() on its first argument thus leading to an unbalanced of_node_get/put() issue if the node has not been retained before that. Instead of passing the root node, pass NULL, which does exactly the same: iterate over all DT nodes, starting from the root node. Signed-off-by: Boris Brezillon Reported-by: Alexandre Belloni Fixes: 3d61467f9bab ("irqchip: atmel-aic: Implement RTC irq fixup") Signed-off-by: Marc Zyngier Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-atmel-aic-common.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/irqchip/irq-atmel-aic-common.c b/drivers/irqchip/irq-atmel-aic-common.c index fe177224ee71..831a195cb806 100644 --- a/drivers/irqchip/irq-atmel-aic-common.c +++ b/drivers/irqchip/irq-atmel-aic-common.c @@ -148,9 +148,9 @@ void __init aic_common_rtc_irq_fixup(struct device_node *root) struct device_node *np; void __iomem *regs; - np = of_find_compatible_node(root, NULL, "atmel,at91rm9200-rtc"); + np = of_find_compatible_node(NULL, NULL, "atmel,at91rm9200-rtc"); if (!np) - np = of_find_compatible_node(root, NULL, + np = of_find_compatible_node(NULL, NULL, "atmel,at91sam9x5-rtc"); if (!np) From 46d51a26efbc7cbaa2bc1f01628a00a604193856 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 20 Aug 2017 13:26:27 -0700 Subject: [PATCH 016/546] Sanitize 'move_pages()' permission checks commit 197e7e521384a23b9e585178f3f11c9fa08274b9 upstream. The 'move_paghes()' system call was introduced long long ago with the same permission checks as for sending a signal (except using CAP_SYS_NICE instead of CAP_SYS_KILL for the overriding capability). That turns out to not be a great choice - while the system call really only moves physical page allocations around (and you need other capabilities to do a lot of it), you can check the return value to map out some the virtual address choices and defeat ASLR of a binary that still shares your uid. So change the access checks to the more common 'ptrace_may_access()' model instead. This tightens the access checks for the uid, and also effectively changes the CAP_SYS_NICE check to CAP_SYS_PTRACE, but it's unlikely that anybody really _uses_ this legacy system call any more (we hav ebetter NUMA placement models these days), so I expect nobody to notice. Famous last words. Reported-by: Otto Ebeling Acked-by: Eric W. Biederman Cc: Willy Tarreau Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- mm/migrate.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/mm/migrate.c b/mm/migrate.c index 72c09dea6526..afedcfab60e2 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -38,6 +38,7 @@ #include #include #include +#include #include @@ -1483,7 +1484,6 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, const int __user *, nodes, int __user *, status, int, flags) { - const struct cred *cred = current_cred(), *tcred; struct task_struct *task; struct mm_struct *mm; int err; @@ -1507,14 +1507,9 @@ SYSCALL_DEFINE6(move_pages, pid_t, pid, unsigned long, nr_pages, /* * Check if this process has the right to modify the specified - * process. The right exists if the process has administrative - * capabilities, superuser privileges or the same - * userid as the target process. + * process. Use the regular "ptrace_may_access()" checks. */ - tcred = __task_cred(task); - if (!uid_eq(cred->euid, tcred->suid) && !uid_eq(cred->euid, tcred->uid) && - !uid_eq(cred->uid, tcred->suid) && !uid_eq(cred->uid, tcred->uid) && - !capable(CAP_SYS_NICE)) { + if (!ptrace_may_access(task, PTRACE_MODE_READ_REALCREDS)) { rcu_read_unlock(); err = -EPERM; goto out; From b4cf49024cf412a94121e61f8056636c557ead98 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 21 Aug 2017 17:35:02 +0200 Subject: [PATCH 017/546] pids: make task_tgid_nr_ns() safe commit dd1c1f2f2028a7b851f701fc6a8ebe39dcb95e7c upstream. This was reported many times, and this was even mentioned in commit 52ee2dfdd4f5 ("pids: refactor vnr/nr_ns helpers to make them safe") but somehow nobody bothered to fix the obvious problem: task_tgid_nr_ns() is not safe because task->group_leader points to nowhere after the exiting task passes exit_notify(), rcu_read_lock() can not help. We really need to change __unhash_process() to nullify group_leader, parent, and real_parent, but this needs some cleanups. Until then we can turn task_tgid_nr_ns() into another user of __task_pid_nr_ns() and fix the problem. Reported-by: Troy Kensinger Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/linux/pid.h | 4 +++- include/linux/sched.h | 50 +++++++++++++++++++++++-------------------- kernel/pid.c | 11 ++++------ 3 files changed, 34 insertions(+), 31 deletions(-) diff --git a/include/linux/pid.h b/include/linux/pid.h index 23705a53abba..97b745ddece5 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -8,7 +8,9 @@ enum pid_type PIDTYPE_PID, PIDTYPE_PGID, PIDTYPE_SID, - PIDTYPE_MAX + PIDTYPE_MAX, + /* only valid to __task_pid_nr_ns() */ + __PIDTYPE_TGID }; /* diff --git a/include/linux/sched.h b/include/linux/sched.h index eff7c1fad26f..e887c8d6f395 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1949,31 +1949,8 @@ static inline pid_t task_tgid_nr(struct task_struct *tsk) return tsk->tgid; } -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns); - -static inline pid_t task_tgid_vnr(struct task_struct *tsk) -{ - return pid_vnr(task_tgid(tsk)); -} - static inline int pid_alive(const struct task_struct *p); -static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) -{ - pid_t pid = 0; - - rcu_read_lock(); - if (pid_alive(tsk)) - pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); - rcu_read_unlock(); - - return pid; -} - -static inline pid_t task_ppid_nr(const struct task_struct *tsk) -{ - return task_ppid_nr_ns(tsk, &init_pid_ns); -} static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) @@ -1998,6 +1975,33 @@ static inline pid_t task_session_vnr(struct task_struct *tsk) return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL); } +static inline pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, ns); +} + +static inline pid_t task_tgid_vnr(struct task_struct *tsk) +{ + return __task_pid_nr_ns(tsk, __PIDTYPE_TGID, NULL); +} + +static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns) +{ + pid_t pid = 0; + + rcu_read_lock(); + if (pid_alive(tsk)) + pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns); + rcu_read_unlock(); + + return pid; +} + +static inline pid_t task_ppid_nr(const struct task_struct *tsk) +{ + return task_ppid_nr_ns(tsk, &init_pid_ns); +} + /* obsolete, do not use */ static inline pid_t task_pgrp_nr(struct task_struct *tsk) { diff --git a/kernel/pid.c b/kernel/pid.c index 78b3d9f80d44..b17263be9082 100644 --- a/kernel/pid.c +++ b/kernel/pid.c @@ -526,8 +526,11 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, if (!ns) ns = task_active_pid_ns(current); if (likely(pid_alive(task))) { - if (type != PIDTYPE_PID) + if (type != PIDTYPE_PID) { + if (type == __PIDTYPE_TGID) + type = PIDTYPE_PID; task = task->group_leader; + } nr = pid_nr_ns(rcu_dereference(task->pids[type].pid), ns); } rcu_read_unlock(); @@ -536,12 +539,6 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, } EXPORT_SYMBOL(__task_pid_nr_ns); -pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns) -{ - return pid_nr_ns(task_tgid(tsk), ns); -} -EXPORT_SYMBOL(task_tgid_nr_ns); - struct pid_namespace *task_active_pid_ns(struct task_struct *tsk) { return ns_of_pid(task_pid(tsk)); From ce1b98a30571b1022d6ce86d9876a7a7dbf9aed5 Mon Sep 17 00:00:00 2001 From: Stephane Eranian Date: Thu, 3 Dec 2015 23:33:17 +0100 Subject: [PATCH 018/546] perf/x86: Fix LBR related crashes on Intel Atom commit 6fc2e83077b05a061afe9b24f2fdff7a0434eb67 upstream. This patches fixes the LBR kernel crashes on Intel Atom. The kernel was assuming that if the CPU supports 64-bit format LBR, then it has an LBR_SELECT MSR. Atom uses 64-bit LBR format but does not have LBR_SELECT. That was causing NULL pointer dereferences in a couple of places. Signed-off-by: Stephane Eranian Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: kan.liang@intel.com Fixes: 96f3eda67fcf ("perf/x86/intel: Fix static checker warning in lbr enable") Link: http://lkml.kernel.org/r/1449182000-31524-2-git-send-email-eranian@google.com Signed-off-by: Ingo Molnar Signed-off-by: Denys Zagorui Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/cpu/perf_event_intel_lbr.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/cpu/perf_event_intel_lbr.c b/arch/x86/kernel/cpu/perf_event_intel_lbr.c index 8900400230c6..2cdae69d7e0b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_lbr.c +++ b/arch/x86/kernel/cpu/perf_event_intel_lbr.c @@ -153,7 +153,7 @@ static void __intel_pmu_lbr_enable(bool pmi) */ if (cpuc->lbr_sel) lbr_select = cpuc->lbr_sel->config; - if (!pmi) + if (!pmi && cpuc->lbr_sel) wrmsrl(MSR_LBR_SELECT, lbr_select); rdmsrl(MSR_IA32_DEBUGCTLMSR, debugctl); @@ -432,8 +432,10 @@ static void intel_pmu_lbr_read_64(struct cpu_hw_events *cpuc) int out = 0; int num = x86_pmu.lbr_nr; - if (cpuc->lbr_sel->config & LBR_CALL_STACK) - num = tos; + if (cpuc->lbr_sel) { + if (cpuc->lbr_sel->config & LBR_CALL_STACK) + num = tos; + } for (i = 0; i < num; i++) { unsigned long lbr_idx = (tos - i) & mask; From 6b45092236817efd713e92beab934fe404393324 Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Fri, 2 Jun 2017 16:36:26 +0300 Subject: [PATCH 019/546] usb: optimize acpi companion search for usb port devices commit ed18c5fa945768a9bec994e786edbbbc7695acf6 upstream. This optimization significantly reduces xhci driver load time. In ACPI tables the acpi companion port devices are children of the hub device. The port devices are identified by their port number returned by the ACPI _ADR method. _ADR 0 is reserved for the root hub device. The current implementation to find a acpi companion port device loops through all acpi port devices under that parent hub, evaluating their _ADR method each time a new port device is added. for a xHC controller with 25 ports under its roothub it will end up invoking ACPI bytecode 625 times before all ports are ready, making it really slow. The _ADR values are already read and cached earler. So instead of running the bytecode again we can check the cached _ADR value first, and then fall back to the old way. As one of the more significant changes, the xhci load time on Intel kabylake reduced by 70%, (28ms) from initcall xhci_pci_init+0x0/0x49 returned 0 after 39537 usecs to initcall xhci_pci_init+0x0/0x49 returned 0 after 11270 usecs Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/usb-acpi.c | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/usb-acpi.c b/drivers/usb/core/usb-acpi.c index 2776cfe64c09..ef9cf4a21afe 100644 --- a/drivers/usb/core/usb-acpi.c +++ b/drivers/usb/core/usb-acpi.c @@ -127,6 +127,22 @@ static enum usb_port_connect_type usb_acpi_get_connect_type(acpi_handle handle, */ #define USB_ACPI_LOCATION_VALID (1 << 31) +static struct acpi_device *usb_acpi_find_port(struct acpi_device *parent, + int raw) +{ + struct acpi_device *adev; + + if (!parent) + return NULL; + + list_for_each_entry(adev, &parent->children, node) { + if (acpi_device_adr(adev) == raw) + return adev; + } + + return acpi_find_child_device(parent, raw, false); +} + static struct acpi_device *usb_acpi_find_companion(struct device *dev) { struct usb_device *udev; @@ -174,8 +190,10 @@ static struct acpi_device *usb_acpi_find_companion(struct device *dev) int raw; raw = usb_hcd_find_raw_port_number(hcd, port1); - adev = acpi_find_child_device(ACPI_COMPANION(&udev->dev), - raw, false); + + adev = usb_acpi_find_port(ACPI_COMPANION(&udev->dev), + raw); + if (!adev) return NULL; } else { @@ -186,7 +204,9 @@ static struct acpi_device *usb_acpi_find_companion(struct device *dev) return NULL; acpi_bus_get_device(parent_handle, &adev); - adev = acpi_find_child_device(adev, port1, false); + + adev = usb_acpi_find_port(adev, port1); + if (!adev) return NULL; } From ccf1033d99834d5ead622542b325e0f063caf05e Mon Sep 17 00:00:00 2001 From: Hector Martin Date: Wed, 2 Aug 2017 00:45:44 +0900 Subject: [PATCH 020/546] usb: qmi_wwan: add D-Link DWM-222 device ID commit bed9ff165960921303a100228585f2d1691b42eb upstream. Signed-off-by: Hector Martin Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/qmi_wwan.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/usb/qmi_wwan.c b/drivers/net/usb/qmi_wwan.c index 582d8f0c6266..958af3b1af7f 100644 --- a/drivers/net/usb/qmi_wwan.c +++ b/drivers/net/usb/qmi_wwan.c @@ -707,6 +707,7 @@ static const struct usb_device_id products[] = { {QMI_FIXED_INTF(0x19d2, 0x1428, 2)}, /* Telewell TW-LTE 4G v2 */ {QMI_FIXED_INTF(0x19d2, 0x2002, 4)}, /* ZTE (Vodafone) K3765-Z */ {QMI_FIXED_INTF(0x2001, 0x7e19, 4)}, /* D-Link DWM-221 B1 */ + {QMI_FIXED_INTF(0x2001, 0x7e35, 4)}, /* D-Link DWM-222 */ {QMI_FIXED_INTF(0x0f3d, 0x68a2, 8)}, /* Sierra Wireless MC7700 */ {QMI_FIXED_INTF(0x114f, 0x68a2, 8)}, /* Sierra Wireless MC7750 */ {QMI_FIXED_INTF(0x1199, 0x68a2, 8)}, /* Sierra Wireless MC7710 in QMI mode */ From 982ce2aa79fbe7c961ee948857d5b5b2a0b2ddd9 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 24 Aug 2017 17:02:58 -0700 Subject: [PATCH 021/546] Linux 4.4.84 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7f67b35caf99..9d77ac063ec0 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 83 +SUBLEVEL = 84 EXTRAVERSION = NAME = Blurry Fish Butt From f0cd9201c0c0d1a2f31de8b18271dd9e30fa14f4 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 10:16:45 -0700 Subject: [PATCH 022/546] af_key: do not use GFP_KERNEL in atomic contexts [ Upstream commit 36f41f8fc6d8aa9f8c9072d66ff7cf9055f5e69b ] pfkey_broadcast() might be called from non process contexts, we can not use GFP_KERNEL in these cases [1]. This patch partially reverts commit ba51b6be38c1 ("net: Fix RCU splat in af_key"), only keeping the GFP_ATOMIC forcing under rcu_read_lock() section. [1] : syzkaller reported : in_atomic(): 1, irqs_disabled(): 0, pid: 2932, name: syzkaller183439 3 locks held by syzkaller183439/2932: #0: (&net->xfrm.xfrm_cfg_mutex){+.+.+.}, at: [] pfkey_sendmsg+0x4c8/0x9f0 net/key/af_key.c:3649 #1: (&pfk->dump_lock){+.+.+.}, at: [] pfkey_do_dump+0x76/0x3f0 net/key/af_key.c:293 #2: (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [] spin_lock_bh include/linux/spinlock.h:304 [inline] #2: (&(&net->xfrm.xfrm_policy_lock)->rlock){+...+.}, at: [] xfrm_policy_walk+0x192/0xa30 net/xfrm/xfrm_policy.c:1028 CPU: 0 PID: 2932 Comm: syzkaller183439 Not tainted 4.13.0-rc4+ #24 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 ___might_sleep+0x2b2/0x470 kernel/sched/core.c:5994 __might_sleep+0x95/0x190 kernel/sched/core.c:5947 slab_pre_alloc_hook mm/slab.h:416 [inline] slab_alloc mm/slab.c:3383 [inline] kmem_cache_alloc+0x24b/0x6e0 mm/slab.c:3559 skb_clone+0x1a0/0x400 net/core/skbuff.c:1037 pfkey_broadcast_one+0x4b2/0x6f0 net/key/af_key.c:207 pfkey_broadcast+0x4ba/0x770 net/key/af_key.c:281 dump_sp+0x3d6/0x500 net/key/af_key.c:2685 xfrm_policy_walk+0x2f1/0xa30 net/xfrm/xfrm_policy.c:1042 pfkey_dump_sp+0x42/0x50 net/key/af_key.c:2695 pfkey_do_dump+0xaa/0x3f0 net/key/af_key.c:299 pfkey_spddump+0x1a0/0x210 net/key/af_key.c:2722 pfkey_process+0x606/0x710 net/key/af_key.c:2814 pfkey_sendmsg+0x4d6/0x9f0 net/key/af_key.c:3650 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 ___sys_sendmsg+0x755/0x890 net/socket.c:2035 __sys_sendmsg+0xe5/0x210 net/socket.c:2069 SYSC_sendmsg net/socket.c:2080 [inline] SyS_sendmsg+0x2d/0x50 net/socket.c:2076 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x445d79 RSP: 002b:00007f32447c1dc8 EFLAGS: 00000202 ORIG_RAX: 000000000000002e RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 0000000000445d79 RDX: 0000000000000000 RSI: 000000002023dfc8 RDI: 0000000000000008 RBP: 0000000000000086 R08: 00007f32447c2700 R09: 00007f32447c2700 R10: 00007f32447c2700 R11: 0000000000000202 R12: 0000000000000000 R13: 00007ffe33edec4f R14: 00007f32447c29c0 R15: 0000000000000000 Fixes: ba51b6be38c1 ("net: Fix RCU splat in af_key") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: David Ahern Acked-by: David Ahern Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/key/af_key.c | 48 ++++++++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 22 deletions(-) diff --git a/net/key/af_key.c b/net/key/af_key.c index 2e1050ec2cf0..94bf810ad242 100644 --- a/net/key/af_key.c +++ b/net/key/af_key.c @@ -228,7 +228,7 @@ static int pfkey_broadcast_one(struct sk_buff *skb, struct sk_buff **skb2, #define BROADCAST_ONE 1 #define BROADCAST_REGISTERED 2 #define BROADCAST_PROMISC_ONLY 4 -static int pfkey_broadcast(struct sk_buff *skb, +static int pfkey_broadcast(struct sk_buff *skb, gfp_t allocation, int broadcast_flags, struct sock *one_sk, struct net *net) { @@ -278,7 +278,7 @@ static int pfkey_broadcast(struct sk_buff *skb, rcu_read_unlock(); if (one_sk != NULL) - err = pfkey_broadcast_one(skb, &skb2, GFP_KERNEL, one_sk); + err = pfkey_broadcast_one(skb, &skb2, allocation, one_sk); kfree_skb(skb2); kfree_skb(skb); @@ -311,7 +311,7 @@ static int pfkey_do_dump(struct pfkey_sock *pfk) hdr = (struct sadb_msg *) pfk->dump.skb->data; hdr->sadb_msg_seq = 0; hdr->sadb_msg_errno = rc; - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = NULL; } @@ -355,7 +355,7 @@ static int pfkey_error(const struct sadb_msg *orig, int err, struct sock *sk) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1396,7 +1396,7 @@ static int pfkey_getspi(struct sock *sk, struct sk_buff *skb, const struct sadb_ xfrm_state_put(x); - pfkey_broadcast(resp_skb, BROADCAST_ONE, sk, net); + pfkey_broadcast(resp_skb, GFP_KERNEL, BROADCAST_ONE, sk, net); return 0; } @@ -1483,7 +1483,7 @@ static int key_notify_sa(struct xfrm_state *x, const struct km_event *c) hdr->sadb_msg_seq = c->seq; hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, xs_net(x)); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xs_net(x)); return 0; } @@ -1596,7 +1596,7 @@ static int pfkey_get(struct sock *sk, struct sk_buff *skb, const struct sadb_msg out_hdr->sadb_msg_reserved = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, sock_net(sk)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, sock_net(sk)); return 0; } @@ -1701,8 +1701,8 @@ static int pfkey_register(struct sock *sk, struct sk_buff *skb, const struct sad return -ENOBUFS; } - pfkey_broadcast(supp_skb, BROADCAST_REGISTERED, sk, sock_net(sk)); - + pfkey_broadcast(supp_skb, GFP_KERNEL, BROADCAST_REGISTERED, sk, + sock_net(sk)); return 0; } @@ -1720,7 +1720,8 @@ static int unicast_flush_resp(struct sock *sk, const struct sadb_msg *ihdr) hdr->sadb_msg_errno = (uint8_t) 0; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); - return pfkey_broadcast(skb, BROADCAST_ONE, sk, sock_net(sk)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ONE, sk, + sock_net(sk)); } static int key_notify_sa_flush(const struct km_event *c) @@ -1741,7 +1742,7 @@ static int key_notify_sa_flush(const struct km_event *c) hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -1798,7 +1799,7 @@ static int dump_sa(struct xfrm_state *x, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -1886,7 +1887,7 @@ static int pfkey_promisc(struct sock *sk, struct sk_buff *skb, const struct sadb new_hdr->sadb_msg_errno = 0; } - pfkey_broadcast(skb, BROADCAST_ALL, NULL, sock_net(sk)); + pfkey_broadcast(skb, GFP_KERNEL, BROADCAST_ALL, NULL, sock_net(sk)); return 0; } @@ -2219,7 +2220,7 @@ static int key_notify_policy(struct xfrm_policy *xp, int dir, const struct km_ev out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = c->seq; out_hdr->sadb_msg_pid = c->portid; - pfkey_broadcast(out_skb, BROADCAST_ALL, NULL, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ALL, NULL, xp_net(xp)); return 0; } @@ -2439,7 +2440,7 @@ static int key_pol_get_resp(struct sock *sk, struct xfrm_policy *xp, const struc out_hdr->sadb_msg_errno = 0; out_hdr->sadb_msg_seq = hdr->sadb_msg_seq; out_hdr->sadb_msg_pid = hdr->sadb_msg_pid; - pfkey_broadcast(out_skb, BROADCAST_ONE, sk, xp_net(xp)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_ONE, sk, xp_net(xp)); err = 0; out: @@ -2695,7 +2696,7 @@ static int dump_sp(struct xfrm_policy *xp, int dir, int count, void *ptr) out_hdr->sadb_msg_pid = pfk->dump.msg_portid; if (pfk->dump.skb) - pfkey_broadcast(pfk->dump.skb, BROADCAST_ONE, + pfkey_broadcast(pfk->dump.skb, GFP_ATOMIC, BROADCAST_ONE, &pfk->sk, sock_net(&pfk->sk)); pfk->dump.skb = out_skb; @@ -2752,7 +2753,7 @@ static int key_notify_policy_flush(const struct km_event *c) hdr->sadb_msg_satype = SADB_SATYPE_UNSPEC; hdr->sadb_msg_len = (sizeof(struct sadb_msg) / sizeof(uint64_t)); hdr->sadb_msg_reserved = 0; - pfkey_broadcast(skb_out, BROADCAST_ALL, NULL, c->net); + pfkey_broadcast(skb_out, GFP_ATOMIC, BROADCAST_ALL, NULL, c->net); return 0; } @@ -2814,7 +2815,7 @@ static int pfkey_process(struct sock *sk, struct sk_buff *skb, const struct sadb void *ext_hdrs[SADB_EXT_MAX]; int err; - pfkey_broadcast(skb_clone(skb, GFP_KERNEL), + pfkey_broadcast(skb_clone(skb, GFP_KERNEL), GFP_KERNEL, BROADCAST_PROMISC_ONLY, NULL, sock_net(sk)); memset(ext_hdrs, 0, sizeof(ext_hdrs)); @@ -3036,7 +3037,8 @@ static int key_notify_sa_expire(struct xfrm_state *x, const struct km_event *c) out_hdr->sadb_msg_seq = 0; out_hdr->sadb_msg_pid = 0; - pfkey_broadcast(out_skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + pfkey_broadcast(out_skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); return 0; } @@ -3226,7 +3228,8 @@ static int pfkey_send_acquire(struct xfrm_state *x, struct xfrm_tmpl *t, struct xfrm_ctx->ctx_len); } - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } static struct xfrm_policy *pfkey_compile_policy(struct sock *sk, int opt, @@ -3424,7 +3427,8 @@ static int pfkey_send_new_mapping(struct xfrm_state *x, xfrm_address_t *ipaddr, n_port->sadb_x_nat_t_port_port = sport; n_port->sadb_x_nat_t_port_reserved = 0; - return pfkey_broadcast(skb, BROADCAST_REGISTERED, NULL, xs_net(x)); + return pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_REGISTERED, NULL, + xs_net(x)); } #ifdef CONFIG_NET_KEY_MIGRATE @@ -3616,7 +3620,7 @@ static int pfkey_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, } /* broadcast migrate message to sockets */ - pfkey_broadcast(skb, BROADCAST_ALL, NULL, &init_net); + pfkey_broadcast(skb, GFP_ATOMIC, BROADCAST_ALL, NULL, &init_net); return 0; From c65eca7ddd88c6af4761f9d965fa600a0a9557cc Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Mon, 14 Aug 2017 14:10:25 -0700 Subject: [PATCH 023/546] dccp: purge write queue in dccp_destroy_sock() [ Upstream commit 7749d4ff88d31b0be17c8683143135adaaadc6a7 ] syzkaller reported that DCCP could have a non empty write queue at dismantle time. WARNING: CPU: 1 PID: 2953 at net/core/stream.c:199 sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199 Kernel panic - not syncing: panic_on_warn set ... CPU: 1 PID: 2953 Comm: syz-executor0 Not tainted 4.13.0-rc4+ #2 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 panic+0x1e4/0x417 kernel/panic.c:180 __warn+0x1c4/0x1d9 kernel/panic.c:541 report_bug+0x211/0x2d0 lib/bug.c:183 fixup_bug+0x40/0x90 arch/x86/kernel/traps.c:190 do_trap_no_signal arch/x86/kernel/traps.c:224 [inline] do_trap+0x260/0x390 arch/x86/kernel/traps.c:273 do_error_trap+0x120/0x390 arch/x86/kernel/traps.c:310 do_invalid_op+0x1b/0x20 arch/x86/kernel/traps.c:323 invalid_op+0x1e/0x30 arch/x86/entry/entry_64.S:846 RIP: 0010:sk_stream_kill_queues+0x3ce/0x520 net/core/stream.c:199 RSP: 0018:ffff8801d182f108 EFLAGS: 00010297 RAX: ffff8801d1144140 RBX: ffff8801d13cb280 RCX: 0000000000000000 RDX: 0000000000000000 RSI: ffffffff85137b00 RDI: ffff8801d13cb280 RBP: ffff8801d182f148 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801d13cb4d0 R13: ffff8801d13cb3b8 R14: ffff8801d13cb300 R15: ffff8801d13cb3b8 inet_csk_destroy_sock+0x175/0x3f0 net/ipv4/inet_connection_sock.c:835 dccp_close+0x84d/0xc10 net/dccp/proto.c:1067 inet_release+0xed/0x1c0 net/ipv4/af_inet.c:425 sock_release+0x8d/0x1e0 net/socket.c:597 sock_close+0x16/0x20 net/socket.c:1126 __fput+0x327/0x7e0 fs/file_table.c:210 ____fput+0x15/0x20 fs/file_table.c:246 task_work_run+0x18a/0x260 kernel/task_work.c:116 exit_task_work include/linux/task_work.h:21 [inline] do_exit+0xa32/0x1b10 kernel/exit.c:865 do_group_exit+0x149/0x400 kernel/exit.c:969 get_signal+0x7e8/0x17e0 kernel/signal.c:2330 do_signal+0x94/0x1ee0 arch/x86/kernel/signal.c:808 exit_to_usermode_loop+0x21c/0x2d0 arch/x86/entry/common.c:157 prepare_exit_to_usermode arch/x86/entry/common.c:194 [inline] syscall_return_slowpath+0x3a7/0x450 arch/x86/entry/common.c:263 Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/proto.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 9fe25bf63296..86bc40ba6ba5 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -201,10 +201,7 @@ void dccp_destroy_sock(struct sock *sk) { struct dccp_sock *dp = dccp_sk(sk); - /* - * DCCP doesn't use sk_write_queue, just sk_send_head - * for retransmissions - */ + __skb_queue_purge(&sk->sk_write_queue); if (sk->sk_send_head != NULL) { kfree_skb(sk->sk_send_head); sk->sk_send_head = NULL; From c207ec46b3010d147d9b1363849fe43a818fa696 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 07:03:15 -0700 Subject: [PATCH 024/546] dccp: defer ccid_hc_tx_delete() at dismantle time [ Upstream commit 120e9dabaf551c6dc03d3a10a1f026376cb1811c ] syszkaller team reported another problem in DCCP [1] Problem here is that the structure holding RTO timer (ccid2_hc_tx_rto_expire() handler) is freed too soon. We can not use del_timer_sync() to cancel the timer since this timer wants to grab socket lock (that would risk a dead lock) Solution is to defer the freeing of memory when all references to the socket were released. Socket timers do own a reference, so this should fix the issue. [1] ================================================================== BUG: KASAN: use-after-free in ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144 Read of size 4 at addr ffff8801d2660540 by task kworker/u4:7/3365 CPU: 1 PID: 3365 Comm: kworker/u4:7 Not tainted 4.13.0-rc4+ #3 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Workqueue: events_unbound call_usermodehelper_exec_work Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x24e/0x340 mm/kasan/report.c:409 __asan_report_load4_noabort+0x14/0x20 mm/kasan/report.c:429 ccid2_hc_tx_rto_expire+0x51c/0x5c0 net/dccp/ccids/ccid2.c:144 call_timer_fn+0x233/0x830 kernel/time/timer.c:1268 expire_timers kernel/time/timer.c:1307 [inline] __run_timers+0x7fd/0xb90 kernel/time/timer.c:1601 run_timer_softirq+0x21/0x80 kernel/time/timer.c:1614 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284 invoke_softirq kernel/softirq.c:364 [inline] irq_exit+0x1cc/0x200 kernel/softirq.c:405 exiting_irq arch/x86/include/asm/apic.h:638 [inline] smp_apic_timer_interrupt+0x76/0xa0 arch/x86/kernel/apic/apic.c:1044 apic_timer_interrupt+0x93/0xa0 arch/x86/entry/entry_64.S:702 RIP: 0010:arch_local_irq_enable arch/x86/include/asm/paravirt.h:824 [inline] RIP: 0010:__raw_write_unlock_irq include/linux/rwlock_api_smp.h:267 [inline] RIP: 0010:_raw_write_unlock_irq+0x56/0x70 kernel/locking/spinlock.c:343 RSP: 0018:ffff8801cd50eaa8 EFLAGS: 00000286 ORIG_RAX: ffffffffffffff10 RAX: dffffc0000000000 RBX: ffffffff85a090c0 RCX: 0000000000000006 RDX: 1ffffffff0b595f3 RSI: 1ffff1003962f989 RDI: ffffffff85acaf98 RBP: ffff8801cd50eab0 R08: 0000000000000001 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000000 R12: ffff8801cc96ea60 R13: dffffc0000000000 R14: ffff8801cc96e4c0 R15: ffff8801cc96e4c0 release_task+0xe9e/0x1a40 kernel/exit.c:220 wait_task_zombie kernel/exit.c:1162 [inline] wait_consider_task+0x29b8/0x33c0 kernel/exit.c:1389 do_wait_thread kernel/exit.c:1452 [inline] do_wait+0x441/0xa90 kernel/exit.c:1523 kernel_wait4+0x1f5/0x370 kernel/exit.c:1665 SYSC_wait4+0x134/0x140 kernel/exit.c:1677 SyS_wait4+0x2c/0x40 kernel/exit.c:1673 call_usermodehelper_exec_sync kernel/kmod.c:286 [inline] call_usermodehelper_exec_work+0x1a0/0x2c0 kernel/kmod.c:323 process_one_work+0xbf3/0x1bc0 kernel/workqueue.c:2097 worker_thread+0x223/0x1860 kernel/workqueue.c:2231 kthread+0x35e/0x430 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:425 Allocated by task 21267: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc+0x127/0x750 mm/slab.c:3561 ccid_new+0x20e/0x390 net/dccp/ccid.c:151 dccp_hdlr_ccid+0x27/0x140 net/dccp/feat.c:44 __dccp_feat_activate+0x142/0x2a0 net/dccp/feat.c:344 dccp_feat_activate_values+0x34e/0xa90 net/dccp/feat.c:1538 dccp_rcv_request_sent_state_process net/dccp/input.c:472 [inline] dccp_rcv_state_process+0xed1/0x1620 net/dccp/input.c:677 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679 sk_backlog_rcv include/net/sock.h:911 [inline] __release_sock+0x124/0x360 net/core/sock.c:2269 release_sock+0xa4/0x2a0 net/core/sock.c:2784 inet_wait_for_connect net/ipv4/af_inet.c:557 [inline] __inet_stream_connect+0x671/0xf00 net/ipv4/af_inet.c:643 inet_stream_connect+0x58/0xa0 net/ipv4/af_inet.c:682 SYSC_connect+0x204/0x470 net/socket.c:1642 SyS_connect+0x24/0x30 net/socket.c:1623 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 3049: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3763 ccid_hc_tx_delete+0xc5/0x100 net/dccp/ccid.c:190 dccp_destroy_sock+0x1d1/0x2b0 net/dccp/proto.c:225 inet_csk_destroy_sock+0x166/0x3f0 net/ipv4/inet_connection_sock.c:833 dccp_done+0xb7/0xd0 net/dccp/proto.c:145 dccp_time_wait+0x13d/0x300 net/dccp/minisocks.c:72 dccp_rcv_reset+0x1d1/0x5b0 net/dccp/input.c:160 dccp_rcv_state_process+0x8fc/0x1620 net/dccp/input.c:663 dccp_v4_do_rcv+0xeb/0x160 net/dccp/ipv4.c:679 sk_backlog_rcv include/net/sock.h:911 [inline] __sk_receive_skb+0x33e/0xc00 net/core/sock.c:521 dccp_v4_rcv+0xef1/0x1c00 net/dccp/ipv4.c:871 ip_local_deliver_finish+0x2e2/0xba0 net/ipv4/ip_input.c:216 NF_HOOK include/linux/netfilter.h:248 [inline] ip_local_deliver+0x1ce/0x6d0 net/ipv4/ip_input.c:257 dst_input include/net/dst.h:477 [inline] ip_rcv_finish+0x8db/0x19c0 net/ipv4/ip_input.c:397 NF_HOOK include/linux/netfilter.h:248 [inline] ip_rcv+0xc3f/0x17d0 net/ipv4/ip_input.c:488 __netif_receive_skb_core+0x19af/0x33d0 net/core/dev.c:4417 __netif_receive_skb+0x2c/0x1b0 net/core/dev.c:4455 process_backlog+0x203/0x740 net/core/dev.c:5130 napi_poll net/core/dev.c:5527 [inline] net_rx_action+0x792/0x1910 net/core/dev.c:5593 __do_softirq+0x2f5/0xba3 kernel/softirq.c:284 The buggy address belongs to the object at ffff8801d2660100 which belongs to the cache ccid2_hc_tx_sock of size 1240 The buggy address is located 1088 bytes inside of 1240-byte region [ffff8801d2660100, ffff8801d26605d8) The buggy address belongs to the page: page:ffffea0007499800 count:1 mapcount:0 mapping:ffff8801d2660100 index:0x0 compound_mapcount: 0 flags: 0x200000000008100(slab|head) raw: 0200000000008100 ffff8801d2660100 0000000000000000 0000000100000005 raw: ffffea00075271a0 ffffea0007538820 ffff8801d3aef9c0 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801d2660400: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ffff8801d2660480: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8801d2660500: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb ^ ffff8801d2660580: fb fb fb fb fb fb fb fb fb fb fb fc fc fc fc fc ffff8801d2660600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Gerrit Renker Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/dccp/proto.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/net/dccp/proto.c b/net/dccp/proto.c index 86bc40ba6ba5..b68168fcc06a 100644 --- a/net/dccp/proto.c +++ b/net/dccp/proto.c @@ -24,6 +24,7 @@ #include #include +#include #include #include @@ -170,6 +171,15 @@ const char *dccp_packet_name(const int type) EXPORT_SYMBOL_GPL(dccp_packet_name); +static void dccp_sk_destruct(struct sock *sk) +{ + struct dccp_sock *dp = dccp_sk(sk); + + ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); + dp->dccps_hc_tx_ccid = NULL; + inet_sock_destruct(sk); +} + int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) { struct dccp_sock *dp = dccp_sk(sk); @@ -179,6 +189,7 @@ int dccp_init_sock(struct sock *sk, const __u8 ctl_sock_initialized) icsk->icsk_syn_retries = sysctl_dccp_request_retries; sk->sk_state = DCCP_CLOSED; sk->sk_write_space = dccp_write_space; + sk->sk_destruct = dccp_sk_destruct; icsk->icsk_sync_mss = dccp_sync_mss; dp->dccps_mss_cache = 536; dp->dccps_rate_last = jiffies; @@ -219,8 +230,7 @@ void dccp_destroy_sock(struct sock *sk) dp->dccps_hc_rx_ackvec = NULL; } ccid_hc_rx_delete(dp->dccps_hc_rx_ccid, sk); - ccid_hc_tx_delete(dp->dccps_hc_tx_ccid, sk); - dp->dccps_hc_rx_ccid = dp->dccps_hc_tx_ccid = NULL; + dp->dccps_hc_rx_ccid = NULL; /* clean up feature negotiation state */ dccp_feat_list_purge(&dp->dccps_featneg); From 114414b8547525709851b5901b41fda25f9382b1 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Tue, 15 Aug 2017 05:26:17 -0700 Subject: [PATCH 025/546] ipv4: fix NULL dereference in free_fib_info_rcu() [ Upstream commit 187e5b3ac84d3421d2de3aca949b2791fbcad554 ] If fi->fib_metrics could not be allocated in fib_create_info() we attempt to dereference a NULL pointer in free_fib_info_rcu() : m = fi->fib_metrics; if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt)) kfree(m); Before my recent patch, we used to call kfree(NULL) and nothing wrong happened. Instead of using RCU to defer freeing while we are under memory stress, it seems better to take immediate action. This was reported by syzkaller team. Fixes: 3fb07daff8e9 ("ipv4: add reference counting to metrics") Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/fib_semantics.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index b2504712259f..313e3c11a15a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -1044,15 +1044,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL); if (!fi) goto failure; - fib_info_cnt++; if (cfg->fc_mx) { fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL); - if (!fi->fib_metrics) - goto failure; + if (unlikely(!fi->fib_metrics)) { + kfree(fi); + return ERR_PTR(err); + } atomic_set(&fi->fib_metrics->refcnt, 1); - } else + } else { fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics; - + } + fib_info_cnt++; fi->fib_net = net; fi->fib_protocol = cfg->fc_protocol; fi->fib_scope = cfg->fc_scope; From 7e1fe0062c24b1cdfb58fb494d03741a6b0a4ac8 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Tue, 15 Aug 2017 16:37:04 +0300 Subject: [PATCH 026/546] net_sched/sfq: update hierarchical backlog when drop packet [ Upstream commit 325d5dc3f7e7c2840b65e4a2988c082c2c0025c5 ] When sfq_enqueue() drops head packet or packet from another queue it have to update backlog at upper qdiscs too. Fixes: 2ccccf5fb43f ("net_sched: update hierarchical backlog too") Signed-off-by: Konstantin Khlebnikov Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/sch_sfq.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c index 4431e2833e45..3f2c3eed04da 100644 --- a/net/sched/sch_sfq.c +++ b/net/sched/sch_sfq.c @@ -434,6 +434,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) qdisc_drop(head, sch); slot_queue_add(slot, skb); + qdisc_tree_reduce_backlog(sch, 0, delta); return NET_XMIT_CN; } @@ -465,8 +466,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch) /* Return Congestion Notification only if we dropped a packet * from this flow. */ - if (qlen != slot->qlen) + if (qlen != slot->qlen) { + qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb)); return NET_XMIT_CN; + } /* As we dropped a packet, better let upper stack know this */ qdisc_tree_reduce_backlog(sch, 1, dropped); From 1bd54371388c0c1e24e3ffa8afde9e130c5799b9 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 11:09:12 -0700 Subject: [PATCH 027/546] ipv4: better IP_MAX_MTU enforcement [ Upstream commit c780a049f9bf442314335372c9abc4548bfe3e44 ] While working on yet another syzkaller report, I found that our IP_MAX_MTU enforcements were not properly done. gcc seems to reload dev->mtu for min(dev->mtu, IP_MAX_MTU), and final result can be bigger than IP_MAX_MTU :/ This is a problem because device mtu can be changed on other cpus or threads. While this patch does not fix the issue I am working on, it is probably worth addressing it. Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip.h | 4 ++-- net/ipv4/route.c | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/include/net/ip.h b/include/net/ip.h index b450d8653b30..7476bb10ff37 100644 --- a/include/net/ip.h +++ b/include/net/ip.h @@ -314,7 +314,7 @@ static inline unsigned int ip_dst_mtu_maybe_forward(const struct dst_entry *dst, !forwarding) return dst_mtu(dst); - return min(dst->dev->mtu, IP_MAX_MTU); + return min(READ_ONCE(dst->dev->mtu), IP_MAX_MTU); } static inline unsigned int ip_skb_dst_mtu(const struct sk_buff *skb) @@ -327,7 +327,7 @@ static inline unsigned int ip_skb_dst_mtu(const struct sk_buff *skb) return ip_dst_mtu_maybe_forward(skb_dst(skb), forwarding); } - return min(skb_dst(skb)->dev->mtu, IP_MAX_MTU); + return min(READ_ONCE(skb_dst(skb)->dev->mtu), IP_MAX_MTU); } u32 ip_idents_reserve(u32 hash, int segs); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index c295d882c6e0..0294f7c99c85 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1241,7 +1241,7 @@ static unsigned int ipv4_mtu(const struct dst_entry *dst) if (mtu) return mtu; - mtu = dst->dev->mtu; + mtu = READ_ONCE(dst->dev->mtu); if (unlikely(dst_metric_locked(dst, RTAX_MTU))) { if (rt->rt_uses_gateway && mtu > 576) From 0e8d62861552b5691e97d61ecee226000a0519ad Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Wed, 16 Aug 2017 20:16:40 +0200 Subject: [PATCH 028/546] sctp: fully initialize the IPv6 address in sctp_v6_to_addr() [ Upstream commit 15339e441ec46fbc3bf3486bb1ae4845b0f1bb8d ] KMSAN reported use of uninitialized sctp_addr->v4.sin_addr.s_addr and sctp_addr->v6.sin6_scope_id in sctp_v6_cmp_addr() (see below). Make sure all fields of an IPv6 address are initialized, which guarantees that the IPv4 fields are also initialized. ================================================================== BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x172/0x1c0 lib/dump_stack.c:42 is_logbuf_locked mm/kmsan/kmsan.c:59 [inline] kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938 native_save_fl arch/x86/include/asm/irqflags.h:18 [inline] arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline] arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline] __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467 sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651 sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg net/socket.c:643 [inline] SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x13/0x94 RIP: 0033:0x44b479 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000 origin description: ----dst_saddr@sctp_v6_get_dst local variable created at: sk_fullsock include/net/sock.h:2321 [inline] inet6_sk include/linux/ipv6.h:309 [inline] sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 ================================================================== BUG: KMSAN: use of uninitialized memory in sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 CPU: 2 PID: 31056 Comm: syz-executor1 Not tainted 4.11.0-rc5+ #2944 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x172/0x1c0 lib/dump_stack.c:42 is_logbuf_locked mm/kmsan/kmsan.c:59 [inline] kmsan_report+0x12a/0x180 mm/kmsan/kmsan.c:938 native_save_fl arch/x86/include/asm/irqflags.h:18 [inline] arch_local_save_flags arch/x86/include/asm/irqflags.h:72 [inline] arch_local_irq_save arch/x86/include/asm/irqflags.h:113 [inline] __msan_warning_32+0x61/0xb0 mm/kmsan/kmsan_instr.c:467 sctp_v6_cmp_addr+0x8d4/0x9f0 net/sctp/ipv6.c:517 sctp_v6_get_dst+0x8c7/0x1630 net/sctp/ipv6.c:290 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 sctp_assoc_add_peer+0x66d/0x16f0 net/sctp/associola.c:651 sctp_sendmsg+0x35a5/0x4f90 net/sctp/socket.c:1871 inet_sendmsg+0x498/0x670 net/ipv4/af_inet.c:762 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg net/socket.c:643 [inline] SYSC_sendto+0x608/0x710 net/socket.c:1696 SyS_sendto+0x8a/0xb0 net/socket.c:1664 entry_SYSCALL_64_fastpath+0x13/0x94 RIP: 0033:0x44b479 RSP: 002b:00007f6213f21c08 EFLAGS: 00000286 ORIG_RAX: 000000000000002c RAX: ffffffffffffffda RBX: 0000000020000000 RCX: 000000000044b479 RDX: 0000000000000041 RSI: 0000000020edd000 RDI: 0000000000000006 RBP: 00000000007080a8 R08: 0000000020b85fe4 R09: 000000000000001c R10: 0000000000040005 R11: 0000000000000286 R12: 00000000ffffffff R13: 0000000000003760 R14: 00000000006e5820 R15: 0000000000ff8000 origin description: ----dst_saddr@sctp_v6_get_dst local variable created at: sk_fullsock include/net/sock.h:2321 [inline] inet6_sk include/linux/ipv6.h:309 [inline] sctp_v6_get_dst+0x91/0x1630 net/sctp/ipv6.c:241 sctp_transport_route+0x101/0x570 net/sctp/transport.c:292 ================================================================== Signed-off-by: Alexander Potapenko Reviewed-by: Xin Long Acked-by: Marcelo Ricardo Leitner Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sctp/ipv6.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c index 7527c168e471..e33e9bd4ed5a 100644 --- a/net/sctp/ipv6.c +++ b/net/sctp/ipv6.c @@ -510,7 +510,9 @@ static void sctp_v6_to_addr(union sctp_addr *addr, struct in6_addr *saddr, { addr->sa.sa_family = AF_INET6; addr->v6.sin6_port = port; + addr->v6.sin6_flowinfo = 0; addr->v6.sin6_addr = *saddr; + addr->v6.sin6_scope_id = 0; } /* Compare addresses exactly. From 69827c395d25abc61023b96cbddaa3af1f3acea6 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Wed, 16 Aug 2017 09:41:54 -0700 Subject: [PATCH 029/546] tipc: fix use-after-free [ Upstream commit 5bfd37b4de5c98e86b12bd13be5aa46c7484a125 ] syszkaller reported use-after-free in tipc [1] When msg->rep skb is freed, set the pointer to NULL, so that caller does not free it again. [1] ================================================================== BUG: KASAN: use-after-free in skb_push+0xd4/0xe0 net/core/skbuff.c:1466 Read of size 8 at addr ffff8801c6e71e90 by task syz-executor5/4115 CPU: 1 PID: 4115 Comm: syz-executor5 Not tainted 4.13.0-rc4+ #32 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:16 [inline] dump_stack+0x194/0x257 lib/dump_stack.c:52 print_address_description+0x73/0x250 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 [inline] kasan_report+0x24e/0x340 mm/kasan/report.c:409 __asan_report_load8_noabort+0x14/0x20 mm/kasan/report.c:430 skb_push+0xd4/0xe0 net/core/skbuff.c:1466 tipc_nl_compat_recv+0x833/0x18f0 net/tipc/netlink_compat.c:1209 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x4512e9 RSP: 002b:00007f3bc8184c08 EFLAGS: 00000216 ORIG_RAX: 0000000000000001 RAX: ffffffffffffffda RBX: 0000000000718000 RCX: 00000000004512e9 RDX: 0000000000000020 RSI: 0000000020fdb000 RDI: 0000000000000006 RBP: 0000000000000086 R08: 0000000000000000 R09: 0000000000000000 R10: 0000000000000000 R11: 0000000000000216 R12: 00000000004b5e76 R13: 00007f3bc8184b48 R14: 00000000004b5e86 R15: 0000000000000000 Allocated by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kasan_slab_alloc+0x12/0x20 mm/kasan/kasan.c:489 kmem_cache_alloc_node+0x13d/0x750 mm/slab.c:3651 __alloc_skb+0xf1/0x740 net/core/skbuff.c:219 alloc_skb include/linux/skbuff.h:903 [inline] tipc_tlv_alloc+0x26/0xb0 net/tipc/netlink_compat.c:148 tipc_nl_compat_dumpit+0xf2/0x3c0 net/tipc/netlink_compat.c:248 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline] tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe Freed by task 4115: save_stack_trace+0x16/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 [inline] kasan_slab_free+0x71/0xc0 mm/kasan/kasan.c:524 __cache_free mm/slab.c:3503 [inline] kmem_cache_free+0x77/0x280 mm/slab.c:3763 kfree_skbmem+0x1a1/0x1d0 net/core/skbuff.c:622 __kfree_skb net/core/skbuff.c:682 [inline] kfree_skb+0x165/0x4c0 net/core/skbuff.c:699 tipc_nl_compat_dumpit+0x36a/0x3c0 net/tipc/netlink_compat.c:260 tipc_nl_compat_handle net/tipc/netlink_compat.c:1130 [inline] tipc_nl_compat_recv+0x756/0x18f0 net/tipc/netlink_compat.c:1199 genl_family_rcv_msg+0x7b7/0xfb0 net/netlink/genetlink.c:598 genl_rcv_msg+0xb2/0x140 net/netlink/genetlink.c:623 netlink_rcv_skb+0x216/0x440 net/netlink/af_netlink.c:2397 genl_rcv+0x28/0x40 net/netlink/genetlink.c:634 netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline] netlink_unicast+0x4e8/0x6f0 net/netlink/af_netlink.c:1291 netlink_sendmsg+0xa4a/0xe60 net/netlink/af_netlink.c:1854 sock_sendmsg_nosec net/socket.c:633 [inline] sock_sendmsg+0xca/0x110 net/socket.c:643 sock_write_iter+0x31a/0x5d0 net/socket.c:898 call_write_iter include/linux/fs.h:1743 [inline] new_sync_write fs/read_write.c:457 [inline] __vfs_write+0x684/0x970 fs/read_write.c:470 vfs_write+0x189/0x510 fs/read_write.c:518 SYSC_write fs/read_write.c:565 [inline] SyS_write+0xef/0x220 fs/read_write.c:557 entry_SYSCALL_64_fastpath+0x1f/0xbe The buggy address belongs to the object at ffff8801c6e71dc0 which belongs to the cache skbuff_head_cache of size 224 The buggy address is located 208 bytes inside of 224-byte region [ffff8801c6e71dc0, ffff8801c6e71ea0) The buggy address belongs to the page: page:ffffea00071b9c40 count:1 mapcount:0 mapping:ffff8801c6e71000 index:0x0 flags: 0x200000000000100(slab) raw: 0200000000000100 ffff8801c6e71000 0000000000000000 000000010000000c raw: ffffea0007224a20 ffff8801d98caf48 ffff8801d9e79040 0000000000000000 page dumped because: kasan: bad access detected Memory state around the buggy address: ffff8801c6e71d80: fc fc fc fc fc fc fc fc fb fb fb fb fb fb fb fb ffff8801c6e71e00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb >ffff8801c6e71e80: fb fb fb fb fc fc fc fc fc fc fc fc fc fc fc fc ^ ffff8801c6e71f00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ffff8801c6e71f80: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc ================================================================== Signed-off-by: Eric Dumazet Reported-by: Dmitry Vyukov Cc: Jon Maloy Cc: Ying Xue Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/tipc/netlink_compat.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c index a0c90572d0e5..f86c6555a539 100644 --- a/net/tipc/netlink_compat.c +++ b/net/tipc/netlink_compat.c @@ -258,13 +258,15 @@ static int tipc_nl_compat_dumpit(struct tipc_nl_compat_cmd_dump *cmd, arg = nlmsg_new(0, GFP_KERNEL); if (!arg) { kfree_skb(msg->rep); + msg->rep = NULL; return -ENOMEM; } err = __tipc_nl_compat_dumpit(cmd, msg, arg); - if (err) + if (err) { kfree_skb(msg->rep); - + msg->rep = NULL; + } kfree_skb(arg); return err; From 6415a71ddf15b54939937581a5e35a4ab07883a0 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Wed, 16 Aug 2017 11:18:09 -0700 Subject: [PATCH 030/546] ipv6: reset fn->rr_ptr when replacing route [ Upstream commit 383143f31d7d3525a1dbff733d52fff917f82f15 ] syzcaller reported the following use-after-free issue in rt6_select(): BUG: KASAN: use-after-free in rt6_select net/ipv6/route.c:755 [inline] at addr ffff8800bc6994e8 BUG: KASAN: use-after-free in ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 at addr ffff8800bc6994e8 Read of size 4 by task syz-executor1/439628 CPU: 0 PID: 439628 Comm: syz-executor1 Not tainted 4.3.5+ #8 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 0000000000000000 ffff88018fe435b0 ffffffff81ca384d ffff8801d3588c00 ffff8800bc699380 ffff8800bc699500 dffffc0000000000 ffff8801d40a47c0 ffff88018fe435d8 ffffffff81735751 ffff88018fe43660 ffff8800bc699380 Call Trace: [] __dump_stack lib/dump_stack.c:15 [inline] [] dump_stack+0xc1/0x124 lib/dump_stack.c:51 sctp: [Deprecated]: syz-executor0 (pid 439615) Use of struct sctp_assoc_value in delayed_ack socket option. Use struct sctp_sack_info instead [] kasan_object_err+0x21/0x70 mm/kasan/report.c:158 [] print_address_description mm/kasan/report.c:196 [inline] [] kasan_report_error+0x1b4/0x4a0 mm/kasan/report.c:285 [] kasan_report mm/kasan/report.c:305 [inline] [] __asan_report_load4_noabort+0x43/0x50 mm/kasan/report.c:325 [] rt6_select net/ipv6/route.c:755 [inline] [] ip6_pol_route.isra.46+0x1429/0x1470 net/ipv6/route.c:1084 [] ip6_pol_route_output+0x81/0xb0 net/ipv6/route.c:1203 [] fib6_rule_action+0x1f0/0x680 net/ipv6/fib6_rules.c:95 [] fib_rules_lookup+0x2a6/0x7a0 net/core/fib_rules.c:223 [] fib6_rule_lookup+0xd0/0x250 net/ipv6/fib6_rules.c:41 [] ip6_route_output+0x1d6/0x2c0 net/ipv6/route.c:1224 [] ip6_dst_lookup_tail+0x4d2/0x890 net/ipv6/ip6_output.c:943 [] ip6_dst_lookup_flow+0x9a/0x250 net/ipv6/ip6_output.c:1079 [] ip6_datagram_dst_update+0x538/0xd40 net/ipv6/datagram.c:91 [] __ip6_datagram_connect net/ipv6/datagram.c:251 [inline] [] ip6_datagram_connect+0x518/0xe50 net/ipv6/datagram.c:272 [] ip6_datagram_connect_v6_only+0x63/0x90 net/ipv6/datagram.c:284 [] inet_dgram_connect+0x170/0x1f0 net/ipv4/af_inet.c:564 [] SYSC_connect+0x1a7/0x2f0 net/socket.c:1582 [] SyS_connect+0x29/0x30 net/socket.c:1563 [] entry_SYSCALL_64_fastpath+0x12/0x17 Object at ffff8800bc699380, in cache ip6_dst_cache size: 384 The root cause of it is that in fib6_add_rt2node(), when it replaces an existing route with the new one, it does not update fn->rr_ptr. This commit resets fn->rr_ptr to NULL when it points to a route which is replaced in fib6_add_rt2node(). Fixes: 27596472473a ("ipv6: fix ECMP route replacement") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index f60e8caea767..bd36ecc8d15c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -892,6 +892,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, } nsiblings = iter->rt6i_nsiblings; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); if (nsiblings) { @@ -904,6 +906,8 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; fib6_purge_rt(iter, fn, info->nl_net); + if (fn->rr_ptr == iter) + fn->rr_ptr = NULL; rt6_release(iter); nsiblings--; } else { From ece3ff173731fa87fec618fbf8734ca867eb8e42 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 18 Aug 2017 17:14:49 -0700 Subject: [PATCH 031/546] ipv6: repair fib6 tree in failure case [ Upstream commit 348a4002729ccab8b888b38cbc099efa2f2a2036 ] In fib6_add(), it is possible that fib6_add_1() picks an intermediate node and sets the node's fn->leaf to NULL in order to add this new route. However, if fib6_add_rt2node() fails to add the new route for some reason, fn->leaf will be left as NULL and could potentially cause crash when fn->leaf is accessed in fib6_locate(). This patch makes sure fib6_repair_tree() is called to properly repair fn->leaf in the above failure case. Here is the syzkaller reported general protection fault in fib6_locate: kasan: CONFIG_KASAN_INLINE enabled kasan: GPF could be caused by NULL-ptr deref or user memory access general protection fault: 0000 [#1] SMP KASAN Modules linked in: CPU: 0 PID: 40937 Comm: syz-executor3 Not tainted Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 task: ffff8801d7d64100 ti: ffff8801d01a0000 task.ti: ffff8801d01a0000 RIP: 0010:[] [] __ipv6_prefix_equal64_half include/net/ipv6.h:475 [inline] RIP: 0010:[] [] ipv6_prefix_equal include/net/ipv6.h:492 [inline] RIP: 0010:[] [] fib6_locate_1 net/ipv6/ip6_fib.c:1210 [inline] RIP: 0010:[] [] fib6_locate+0x281/0x3c0 net/ipv6/ip6_fib.c:1233 RSP: 0018:ffff8801d01a36a8 EFLAGS: 00010202 RAX: 0000000000000020 RBX: ffff8801bc790e00 RCX: ffffc90002983000 RDX: 0000000000001219 RSI: ffff8801d01a37a0 RDI: 0000000000000100 RBP: ffff8801d01a36f0 R08: 00000000000000ff R09: 0000000000000000 R10: 0000000000000003 R11: 0000000000000000 R12: 0000000000000001 R13: dffffc0000000000 R14: ffff8801d01a37a0 R15: 0000000000000000 FS: 00007f6afd68c700(0000) GS:ffff8801db400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004c6340 CR3: 00000000ba41f000 CR4: 00000000001426f0 DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 Stack: ffff8801d01a37a8 ffff8801d01a3780 ffffed003a0346f5 0000000c82a23ea0 ffff8800b7bd7700 ffff8801d01a3780 ffff8800b6a1c940 ffffffff82a23ea0 ffff8801d01a3920 ffff8801d01a3748 ffffffff82a223d6 ffff8801d7d64988 Call Trace: [] ip6_route_del+0x106/0x570 net/ipv6/route.c:2109 [] inet6_rtm_delroute+0xfd/0x100 net/ipv6/route.c:3075 [] rtnetlink_rcv_msg+0x549/0x7a0 net/core/rtnetlink.c:3450 [] netlink_rcv_skb+0x141/0x370 net/netlink/af_netlink.c:2281 [] rtnetlink_rcv+0x2f/0x40 net/core/rtnetlink.c:3456 [] netlink_unicast_kernel net/netlink/af_netlink.c:1206 [inline] [] netlink_unicast+0x518/0x750 net/netlink/af_netlink.c:1232 [] netlink_sendmsg+0x8ce/0xc30 net/netlink/af_netlink.c:1778 [] sock_sendmsg_nosec net/socket.c:609 [inline] [] sock_sendmsg+0xcf/0x110 net/socket.c:619 [] sock_write_iter+0x222/0x3a0 net/socket.c:834 [] new_sync_write+0x1dd/0x2b0 fs/read_write.c:478 [] __vfs_write+0xe4/0x110 fs/read_write.c:491 [] vfs_write+0x178/0x4b0 fs/read_write.c:538 [] SYSC_write fs/read_write.c:585 [inline] [] SyS_write+0xd9/0x1b0 fs/read_write.c:577 [] entry_SYSCALL_64_fastpath+0x12/0x17 Note: there is no "Fixes" tag as this seems to be a bug introduced very early. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index bd36ecc8d15c..aad8cdf15472 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -996,7 +996,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, /* Create subtree root node */ sfn = node_alloc(); if (!sfn) - goto st_failure; + goto failure; sfn->leaf = info->nl_net->ipv6.ip6_null_entry; atomic_inc(&info->nl_net->ipv6.ip6_null_entry->rt6i_ref); @@ -1012,12 +1012,12 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { /* If it is failed, discard just allocated - root, and then (in st_failure) stale node + root, and then (in failure) stale node in main tree. */ node_free(sfn); err = PTR_ERR(sn); - goto st_failure; + goto failure; } /* Now link new subtree to main tree */ @@ -1031,7 +1031,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, if (IS_ERR(sn)) { err = PTR_ERR(sn); - goto st_failure; + goto failure; } } @@ -1073,22 +1073,22 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, atomic_inc(&pn->leaf->rt6i_ref); } #endif - if (!(rt->dst.flags & DST_NOCACHE)) - dst_free(&rt->dst); + goto failure; } return err; -#ifdef CONFIG_IPV6_SUBTREES - /* Subtree creation failed, probably main tree node - is orphan. If it is, shoot it. +failure: + /* fn->leaf could be NULL if fn is an intermediate node and we + * failed to add the new route to it in both subtree creation + * failure and fib6_add_rt2node() failure case. + * In both cases, fib6_repair_tree() should be called to fix + * fn->leaf. */ -st_failure: if (fn && !(fn->fn_flags & (RTN_RTINFO|RTN_ROOT))) fib6_repair_tree(info->nl_net, fn); if (!(rt->dst.flags & DST_NOCACHE)) dst_free(&rt->dst); return err; -#endif } /* From 4e39b7409f3b852ae95e580207b4aec93965834c Mon Sep 17 00:00:00 2001 From: Neal Cardwell Date: Wed, 16 Aug 2017 17:53:36 -0400 Subject: [PATCH 032/546] tcp: when rearming RTO, if RTO time is in past then fire RTO ASAP [ Upstream commit cdbeb633ca71a02b7b63bfeb94994bf4e1a0b894 ] In some situations tcp_send_loss_probe() can realize that it's unable to send a loss probe (TLP), and falls back to calling tcp_rearm_rto() to schedule an RTO timer. In such cases, sometimes tcp_rearm_rto() realizes that the RTO was eligible to fire immediately or at some point in the past (delta_us <= 0). Previously in such cases tcp_rearm_rto() was scheduling such "overdue" RTOs to happen at now + icsk_rto, which caused needless delays of hundreds of milliseconds (and non-linear behavior that made reproducible testing difficult). This commit changes the logic to schedule "overdue" RTOs ASAP, rather than at now + icsk_rto. Fixes: 6ba8a3b19e76 ("tcp: Tail loss probe (TLP)") Suggested-by: Yuchung Cheng Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp_input.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index f0dabd125c43..c4bbf704ff9c 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -3028,8 +3028,7 @@ void tcp_rearm_rto(struct sock *sk) /* delta may not be positive if the socket is locked * when the retrans timer fires and is rescheduled. */ - if (delta > 0) - rto = delta; + rto = max(delta, 1); } inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, rto, TCP_RTO_MAX); From eece6c91dd33da40509e300b4da75f9c1f989269 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 17 Aug 2017 23:14:58 +0100 Subject: [PATCH 033/546] irda: do not leak initialized list.dev to userspace [ Upstream commit b024d949a3c24255a7ef1a470420eb478949aa4c ] list.dev has not been initialized and so the copy_to_user is copying data from the stack back to user space which is a potential information leak. Fix this ensuring all of list is initialized to zero. Detected by CoverityScan, CID#1357894 ("Uninitialized scalar variable") Signed-off-by: Colin Ian King Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/irda/af_irda.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c index 8d2f7c9b491d..4a116d766c15 100644 --- a/net/irda/af_irda.c +++ b/net/irda/af_irda.c @@ -2227,7 +2227,7 @@ static int irda_getsockopt(struct socket *sock, int level, int optname, { struct sock *sk = sock->sk; struct irda_sock *self = irda_sk(sk); - struct irda_device_list list; + struct irda_device_list list = { 0 }; struct irda_device_info *discoveries; struct irda_ias_set * ias_opt; /* IAS get/query params */ struct ias_object * ias_obj; /* Object in IAS */ From 248af6aa226c5e3d503a26e109db944a1cabdb48 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Fri, 18 Aug 2017 11:01:36 +0800 Subject: [PATCH 034/546] net: sched: fix NULL pointer dereference when action calls some targets [ Upstream commit 4f8a881acc9d1adaf1e552349a0b1df28933a04c ] As we know in some target's checkentry it may dereference par.entryinfo to check entry stuff inside. But when sched action calls xt_check_target, par.entryinfo is set with NULL. It would cause kernel panic when calling some targets. It can be reproduce with: # tc qd add dev eth1 ingress handle ffff: # tc filter add dev eth1 parent ffff: u32 match u32 0 0 action xt \ -j ECN --ecn-tcp-remove It could also crash kernel when using target CLUSTERIP or TPROXY. By now there's no proper value for par.entryinfo in ipt_init_target, but it can not be set with NULL. This patch is to void all these panics by setting it with an ipt_entry obj with all members = 0. Note that this issue has been there since the very beginning. Signed-off-by: Xin Long Acked-by: Pablo Neira Ayuso Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/sched/act_ipt.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c index 0915d448ba23..075b0d22f213 100644 --- a/net/sched/act_ipt.c +++ b/net/sched/act_ipt.c @@ -34,6 +34,7 @@ static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int { struct xt_tgchk_param par; struct xt_target *target; + struct ipt_entry e = {}; int ret = 0; target = xt_request_find_target(AF_INET, t->u.user.name, @@ -44,6 +45,7 @@ static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int t->u.kernel.target = target; memset(&par, 0, sizeof(par)); par.table = table; + par.entryinfo = &e; par.target = target; par.targinfo = t->data; par.hook_mask = hook; From 58079f56b30227c37709e9db7e9b51204e81f4ea Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Sat, 19 Aug 2017 15:37:07 +0300 Subject: [PATCH 035/546] net_sched: fix order of queue length updates in qdisc_replace() [ Upstream commit 68a66d149a8c78ec6720f268597302883e48e9fa ] This important to call qdisc_tree_reduce_backlog() after changing queue length. Parent qdisc should deactivate class in ->qlen_notify() called from qdisc_tree_reduce_backlog() but this happens only if qdisc->q.qlen in zero. Missed class deactivations leads to crashes/warnings at picking packets from empty qdisc and corrupting state at reactivating this class in future. Signed-off-by: Konstantin Khlebnikov Fixes: 86a7996cc8a0 ("net_sched: introduce qdisc_replace() helper") Acked-by: Cong Wang Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/sch_generic.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index e5bba897d206..7a5d6a073165 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -717,8 +717,11 @@ static inline struct Qdisc *qdisc_replace(struct Qdisc *sch, struct Qdisc *new, old = *pold; *pold = new; if (old != NULL) { - qdisc_tree_reduce_backlog(old, old->q.qlen, old->qstats.backlog); + unsigned int qlen = old->q.qlen; + unsigned int backlog = old->qstats.backlog; + qdisc_reset(old); + qdisc_tree_reduce_backlog(old, qlen, backlog); } sch_tree_unlock(sch); From 707352e687452e2d14b49fdd02ddf9b5fd6af621 Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Mon, 29 Feb 2016 22:03:23 +0200 Subject: [PATCH 036/546] mei: me: add broxton pci device ids commit dd16f6cdeb4e02a728863d3cf99aaab352f0d761 upstream. Add device ids for Broxton SoC based devices. Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-me-regs.h | 3 +++ drivers/misc/mei/pci-me.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index a2661381ddfc..21e5c6b652c5 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -125,6 +125,9 @@ #define MEI_DEV_ID_KBP 0xA2BA /* Kaby Point */ #define MEI_DEV_ID_KBP_2 0xA2BB /* Kaby Point 2 */ +#define MEI_DEV_ID_BXT_M 0x1A9A /* Broxton M */ +#define MEI_DEV_ID_APL_I 0x5A9A /* Apollo Lake I */ + /* * MEI HW Section */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 01e20384ac44..9d7ed85f2744 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -90,6 +90,9 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_KBP, mei_me_pch8_cfg)}, {MEI_PCI_DEVICE(MEI_DEV_ID_KBP_2, mei_me_pch8_cfg)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_BXT_M, mei_me_pch8_cfg)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_APL_I, mei_me_pch8_cfg)}, + /* required last entry */ {0, } }; From a56800ae1c5779e31b795341cc736afa2908b55e Mon Sep 17 00:00:00 2001 From: Tomas Winkler Date: Tue, 29 Nov 2016 16:49:27 +0200 Subject: [PATCH 037/546] mei: me: add lewisburg device ids commit 9ff2007bea1f1bfc53ac0bc7ccf8200bb275fd52 upstream. Add MEI Lewisburg PCH IDs for Purley based workstations. Signed-off-by: Alexander Usyskin Signed-off-by: Tomas Winkler Signed-off-by: Greg Kroah-Hartman --- drivers/misc/mei/hw-me-regs.h | 2 ++ drivers/misc/mei/pci-me.c | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/misc/mei/hw-me-regs.h b/drivers/misc/mei/hw-me-regs.h index 21e5c6b652c5..d2774197fe58 100644 --- a/drivers/misc/mei/hw-me-regs.h +++ b/drivers/misc/mei/hw-me-regs.h @@ -125,6 +125,8 @@ #define MEI_DEV_ID_KBP 0xA2BA /* Kaby Point */ #define MEI_DEV_ID_KBP_2 0xA2BB /* Kaby Point 2 */ +#define MEI_DEV_ID_LBG 0xA1BA /* Lewisburg (SPT) */ + #define MEI_DEV_ID_BXT_M 0x1A9A /* Broxton M */ #define MEI_DEV_ID_APL_I 0x5A9A /* Apollo Lake I */ diff --git a/drivers/misc/mei/pci-me.c b/drivers/misc/mei/pci-me.c index 9d7ed85f2744..adab5bbb642a 100644 --- a/drivers/misc/mei/pci-me.c +++ b/drivers/misc/mei/pci-me.c @@ -86,6 +86,7 @@ static const struct pci_device_id mei_me_pci_tbl[] = { {MEI_PCI_DEVICE(MEI_DEV_ID_SPT_2, mei_me_pch8_cfg)}, {MEI_PCI_DEVICE(MEI_DEV_ID_SPT_H, mei_me_pch8_sps_cfg)}, {MEI_PCI_DEVICE(MEI_DEV_ID_SPT_H_2, mei_me_pch8_sps_cfg)}, + {MEI_PCI_DEVICE(MEI_DEV_ID_LBG, mei_me_pch8_cfg)}, {MEI_PCI_DEVICE(MEI_DEV_ID_KBP, mei_me_pch8_cfg)}, {MEI_PCI_DEVICE(MEI_DEV_ID_KBP_2, mei_me_pch8_cfg)}, From 5609ae96bcd6eca6c356d257aa5e7597c9e4284c Mon Sep 17 00:00:00 2001 From: Aaron Ma Date: Fri, 18 Aug 2017 12:17:21 -0700 Subject: [PATCH 038/546] Input: trackpoint - add new trackpoint firmware ID commit ec667683c532c93fb41e100e5d61a518971060e2 upstream. Synaptics add new TP firmware ID: 0x2 and 0x3, for now both lower 2 bits are indicated as TP. Change the constant to bitwise values. This makes trackpoint to be recognized on Lenovo Carbon X1 Gen5 instead of it being identified as "PS/2 Generic Mouse". Signed-off-by: Aaron Ma Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/trackpoint.c | 3 ++- drivers/input/mouse/trackpoint.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index 354d47ecd66a..ce6ff9b301bb 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -265,7 +265,8 @@ static int trackpoint_start_protocol(struct psmouse *psmouse, unsigned char *fir if (ps2_command(&psmouse->ps2dev, param, MAKE_PS2_CMD(0, 2, TP_READ_ID))) return -1; - if (param[0] != TP_MAGIC_IDENT) + /* add new TP ID. */ + if (!(param[0] & TP_MAGIC_IDENT)) return -1; if (firmware_id) diff --git a/drivers/input/mouse/trackpoint.h b/drivers/input/mouse/trackpoint.h index 5617ed3a7d7a..88055755f82e 100644 --- a/drivers/input/mouse/trackpoint.h +++ b/drivers/input/mouse/trackpoint.h @@ -21,8 +21,9 @@ #define TP_COMMAND 0xE2 /* Commands start with this */ #define TP_READ_ID 0xE1 /* Sent for device identification */ -#define TP_MAGIC_IDENT 0x01 /* Sent after a TP_READ_ID followed */ +#define TP_MAGIC_IDENT 0x03 /* Sent after a TP_READ_ID followed */ /* by the firmware ID */ + /* Firmware ID includes 0x1, 0x2, 0x3 */ /* From 07051c1754775c95306ef75730efbaa321a97523 Mon Sep 17 00:00:00 2001 From: KT Liao Date: Fri, 18 Aug 2017 16:58:15 -0700 Subject: [PATCH 039/546] Input: elan_i2c - add ELAN0602 ACPI ID to support Lenovo Yoga310 commit 1d2226e45040ed4aee95b633cbd64702bf7fc2a1 upstream. Add ELAN0602 to the list of known ACPI IDs to enable support for ELAN touchpads found in Lenovo Yoga310. Signed-off-by: KT Liao Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/elan_i2c_core.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/input/mouse/elan_i2c_core.c b/drivers/input/mouse/elan_i2c_core.c index 98d4e515587a..681dce15fbc8 100644 --- a/drivers/input/mouse/elan_i2c_core.c +++ b/drivers/input/mouse/elan_i2c_core.c @@ -1234,6 +1234,7 @@ static const struct acpi_device_id elan_acpi_id[] = { { "ELAN0000", 0 }, { "ELAN0100", 0 }, { "ELAN0600", 0 }, + { "ELAN0602", 0 }, { "ELAN0605", 0 }, { "ELAN0608", 0 }, { "ELAN0605", 0 }, From 099e57fcb03fb46ec707c9627ebc5e8b6f1ffcdc Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 22 Aug 2017 08:15:13 +0200 Subject: [PATCH 040/546] ALSA: core: Fix unexpected error at replacing user TLV commit 88c54cdf61f508ebcf8da2d819f5dfc03e954d1d upstream. When user tries to replace the user-defined control TLV, the kernel checks the change of its content via memcmp(). The problem is that the kernel passes the return value from memcmp() as is. memcmp() gives a non-zero negative value depending on the comparison result, and this shall be recognized as an error code. The patch covers that corner-case, return 1 properly for the changed TLV. Fixes: 8aa9b586e420 ("[ALSA] Control API - more robust TLV implementation") Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/control.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/core/control.c b/sound/core/control.c index b4fe9b002512..bd01d492f46a 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1126,7 +1126,7 @@ static int snd_ctl_elem_user_tlv(struct snd_kcontrol *kcontrol, mutex_lock(&ue->card->user_ctl_lock); change = ue->tlv_data_size != size; if (!change) - change = memcmp(ue->tlv_data, new_data, size); + change = memcmp(ue->tlv_data, new_data, size) != 0; kfree(ue->tlv_data); ue->tlv_data = new_data; ue->tlv_data_size = size; From 6b1c81dd7fdbfe2f855454d0329c70222deb29d4 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 23 Aug 2017 09:30:17 +0200 Subject: [PATCH 041/546] ALSA: hda - Add stereo mic quirk for Lenovo G50-70 (17aa:3978) commit bbba6f9d3da357bbabc6fda81e99ff5584500e76 upstream. Lenovo G50-70 (17aa:3978) with Conexant codec chip requires the similar workaround for the inverted stereo dmic like other Lenovo models. Bugzilla: https://bugzilla.suse.com/show_bug.cgi?id=1020657 Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/hda/patch_conexant.c | 1 + 1 file changed, 1 insertion(+) diff --git a/sound/pci/hda/patch_conexant.c b/sound/pci/hda/patch_conexant.c index 46f7b023f69c..ac5de4365e15 100644 --- a/sound/pci/hda/patch_conexant.c +++ b/sound/pci/hda/patch_conexant.c @@ -854,6 +854,7 @@ static const struct snd_pci_quirk cxt5066_fixups[] = { SND_PCI_QUIRK(0x17aa, 0x390b, "Lenovo G50-80", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x3975, "Lenovo U300s", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x3977, "Lenovo IdeaPad U310", CXT_FIXUP_STEREO_DMIC), + SND_PCI_QUIRK(0x17aa, 0x3978, "Lenovo G50-70", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK(0x17aa, 0x397b, "Lenovo S205", CXT_FIXUP_STEREO_DMIC), SND_PCI_QUIRK_VENDOR(0x17aa, "Thinkpad", CXT_FIXUP_THINKPAD_ACPI), SND_PCI_QUIRK(0x1c06, 0x2011, "Lemote A1004", CXT_PINCFG_LEMOTE_A1004), From 10814c149eeb7f604e2ec7cff51b42267beffb38 Mon Sep 17 00:00:00 2001 From: Alexey Brodkin Date: Tue, 1 Aug 2017 12:58:47 +0300 Subject: [PATCH 042/546] ARCv2: PAE40: Explicitly set MSB counterpart of SLC region ops addresses commit 7d79cee2c6540ea64dd917a14e2fd63d4ac3d3c0 upstream. It is necessary to explicitly set both SLC_AUX_RGN_START1 and SLC_AUX_RGN_END1 which hold MSB bits of the physical address correspondingly of region start and end otherwise SLC region operation is executed in unpredictable manner Without this patch, SLC flushes on HSDK (IOC disabled) were taking seconds. Reported-by: Vladimir Kondratiev Signed-off-by: Alexey Brodkin Signed-off-by: Vineet Gupta [vgupta: PAR40 regs only written if PAE40 exist] Signed-off-by: Greg Kroah-Hartman --- arch/arc/include/asm/cache.h | 2 ++ arch/arc/mm/cache.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/arch/arc/include/asm/cache.h b/arch/arc/include/asm/cache.h index 210ef3e72332..0ddd7144c492 100644 --- a/arch/arc/include/asm/cache.h +++ b/arch/arc/include/asm/cache.h @@ -88,7 +88,9 @@ extern int ioc_exists; #define ARC_REG_SLC_FLUSH 0x904 #define ARC_REG_SLC_INVALIDATE 0x905 #define ARC_REG_SLC_RGN_START 0x914 +#define ARC_REG_SLC_RGN_START1 0x915 #define ARC_REG_SLC_RGN_END 0x916 +#define ARC_REG_SLC_RGN_END1 0x917 /* Bit val in SLC_CONTROL */ #define SLC_CTRL_IM 0x040 diff --git a/arch/arc/mm/cache.c b/arch/arc/mm/cache.c index d81b6d7e11e7..9a84cbdd44b0 100644 --- a/arch/arc/mm/cache.c +++ b/arch/arc/mm/cache.c @@ -543,6 +543,7 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op) static DEFINE_SPINLOCK(lock); unsigned long flags; unsigned int ctrl; + phys_addr_t end; spin_lock_irqsave(&lock, flags); @@ -572,8 +573,16 @@ noinline void slc_op(phys_addr_t paddr, unsigned long sz, const int op) * END needs to be setup before START (latter triggers the operation) * END can't be same as START, so add (l2_line_sz - 1) to sz */ - write_aux_reg(ARC_REG_SLC_RGN_END, (paddr + sz + l2_line_sz - 1)); - write_aux_reg(ARC_REG_SLC_RGN_START, paddr); + end = paddr + sz + l2_line_sz - 1; + if (is_pae40_enabled()) + write_aux_reg(ARC_REG_SLC_RGN_END1, upper_32_bits(end)); + + write_aux_reg(ARC_REG_SLC_RGN_END, lower_32_bits(end)); + + if (is_pae40_enabled()) + write_aux_reg(ARC_REG_SLC_RGN_START1, upper_32_bits(paddr)); + + write_aux_reg(ARC_REG_SLC_RGN_START, lower_32_bits(paddr)); while (read_aux_reg(ARC_REG_SLC_CTRL) & SLC_CTRL_BUSY); From 33e4c6378417fe0dbc8777e214be2aa35bb48901 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Wed, 9 Aug 2017 15:28:22 +0200 Subject: [PATCH 043/546] i2c: designware: Fix system suspend commit a23318feeff662c8d25d21623daebdd2e55ec221 upstream. The commit 8503ff166504 ("i2c: designware: Avoid unnecessary resuming during system suspend"), may suggest to the PM core to try out the so called direct_complete path for system sleep. In this path, the PM core treats a runtime suspended device as it's already in a proper low power state for system sleep, which makes it skip calling the system sleep callbacks for the device, except for the ->prepare() and the ->complete() callbacks. However, the PM core may unset the direct_complete flag for a parent device, in case its child device are being system suspended before. In this scenario, the PM core invokes the system sleep callbacks, no matter if the device is runtime suspended or not. Particularly in cases of an existing i2c slave device, the above path is triggered, which breaks the assumption that the i2c device is always runtime resumed whenever the dw_i2c_plat_suspend() is being called. More precisely, dw_i2c_plat_suspend() calls clk_core_disable() and clk_core_unprepare(), for an already disabled/unprepared clock, leading to a splat in the log about clocks calls being wrongly balanced and breaking system sleep. To still allow the direct_complete path in cases when it's possible, but also to keep the fix simple, let's runtime resume the i2c device in the ->suspend() callback, before continuing to put the device into low power state. Note, in cases when the i2c device is attached to the ACPI PM domain, this problem doesn't occur, because ACPI's ->suspend() callback, assigned to acpi_subsys_suspend(), already calls pm_runtime_resume() for the device. It should also be noted that this change does not fix commit 8503ff166504 ("i2c: designware: Avoid unnecessary resuming during system suspend"). Because for the non-ACPI case, the system sleep support was already broken prior that point. Signed-off-by: Ulf Hansson Acked-by: Rafael J. Wysocki Tested-by: John Stultz Tested-by: Jarkko Nikula Acked-by: Jarkko Nikula Reviewed-by: Mika Westerberg Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-designware-platdrv.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-designware-platdrv.c b/drivers/i2c/busses/i2c-designware-platdrv.c index 6b00061c3746..a2ae2213ef3e 100644 --- a/drivers/i2c/busses/i2c-designware-platdrv.c +++ b/drivers/i2c/busses/i2c-designware-platdrv.c @@ -294,7 +294,7 @@ static void dw_i2c_plat_complete(struct device *dev) #endif #ifdef CONFIG_PM -static int dw_i2c_plat_suspend(struct device *dev) +static int dw_i2c_plat_runtime_suspend(struct device *dev) { struct platform_device *pdev = to_platform_device(dev); struct dw_i2c_dev *i_dev = platform_get_drvdata(pdev); @@ -318,11 +318,21 @@ static int dw_i2c_plat_resume(struct device *dev) return 0; } +#ifdef CONFIG_PM_SLEEP +static int dw_i2c_plat_suspend(struct device *dev) +{ + pm_runtime_resume(dev); + return dw_i2c_plat_runtime_suspend(dev); +} +#endif + static const struct dev_pm_ops dw_i2c_dev_pm_ops = { .prepare = dw_i2c_plat_prepare, .complete = dw_i2c_plat_complete, SET_SYSTEM_SLEEP_PM_OPS(dw_i2c_plat_suspend, dw_i2c_plat_resume) - SET_RUNTIME_PM_OPS(dw_i2c_plat_suspend, dw_i2c_plat_resume, NULL) + SET_RUNTIME_PM_OPS(dw_i2c_plat_runtime_suspend, + dw_i2c_plat_resume, + NULL) }; #define DW_I2C_DEV_PMOPS (&dw_i2c_dev_pm_ops) From 2a9d7664ffb2c223c488058ee6bee61512db9396 Mon Sep 17 00:00:00 2001 From: Chris Wilson Date: Sat, 19 Aug 2017 13:05:58 +0100 Subject: [PATCH 044/546] drm: Release driver tracking before making the object available again MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit fe4600a548f2763dec91b3b27a1245c370ceee2a upstream. This is the same bug as we fixed in commit f6cd7daecff5 ("drm: Release driver references to handle before making it available again"), but now the exposure is via the PRIME lookup tables. If we remove the object/handle from the PRIME lut, then a new request for the same object/fd will generate a new handle, thus for a short window that object is known to userspace by two different handles. Fix this by releasing the driver tracking before PRIME. Fixes: 0ff926c7d4f0 ("drm/prime: add exported buffers to current fprivs imported buffer list (v2)") Signed-off-by: Chris Wilson Cc: David Airlie Cc: Daniel Vetter Cc: Rob Clark Cc: Ville Syrjälä Cc: Thierry Reding Reviewed-by: Daniel Vetter Signed-off-by: Joonas Lahtinen Link: https://patchwork.freedesktop.org/patch/msgid/20170819120558.6465-1-chris@chris-wilson.co.uk Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_gem.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/drm_gem.c b/drivers/gpu/drm/drm_gem.c index b205224f1a44..9147113139be 100644 --- a/drivers/gpu/drm/drm_gem.c +++ b/drivers/gpu/drm/drm_gem.c @@ -715,13 +715,13 @@ drm_gem_object_release_handle(int id, void *ptr, void *data) struct drm_gem_object *obj = ptr; struct drm_device *dev = obj->dev; + if (dev->driver->gem_close_object) + dev->driver->gem_close_object(obj, file_priv); + if (drm_core_check_feature(dev, DRIVER_PRIME)) drm_gem_remove_prime_handles(obj, file_priv); drm_vma_node_revoke(&obj->vma_node, file_priv->filp); - if (dev->driver->gem_close_object) - dev->driver->gem_close_object(obj, file_priv); - drm_gem_object_handle_unreference_unlocked(obj); return 0; From 3416ee45a8cbeb5bc4b13a754873fbb26a27dccb Mon Sep 17 00:00:00 2001 From: Maarten Lankhorst Date: Tue, 15 Aug 2017 11:57:06 +0200 Subject: [PATCH 045/546] drm/atomic: If the atomic check fails, return its value first commit a0ffc51e20e90e0c1c2491de2b4b03f48b6caaba upstream. The last part of drm_atomic_check_only is testing whether we need to fail with -EINVAL when modeset is not allowed, but forgets to return the value when atomic_check() fails first. This results in -EDEADLK being replaced by -EINVAL, and the sanity check in drm_modeset_drop_locks kicks in: [ 308.531734] ------------[ cut here ]------------ [ 308.531791] WARNING: CPU: 0 PID: 1886 at drivers/gpu/drm/drm_modeset_lock.c:217 drm_modeset_drop_locks+0x33/0xc0 [drm] [ 308.531828] Modules linked in: [ 308.532050] CPU: 0 PID: 1886 Comm: kms_atomic Tainted: G U W 4.13.0-rc5-patser+ #5225 [ 308.532082] Hardware name: NUC5i7RYB, BIOS RYBDWi35.86A.0246.2015.0309.1355 03/09/2015 [ 308.532124] task: ffff8800cd9dae00 task.stack: ffff8800ca3b8000 [ 308.532168] RIP: 0010:drm_modeset_drop_locks+0x33/0xc0 [drm] [ 308.532189] RSP: 0018:ffff8800ca3bf980 EFLAGS: 00010282 [ 308.532211] RAX: dffffc0000000000 RBX: ffff8800ca3bfaf8 RCX: 0000000013a171e6 [ 308.532235] RDX: 1ffff10019477f69 RSI: ffffffffa8ba4fa0 RDI: ffff8800ca3bfb48 [ 308.532258] RBP: ffff8800ca3bf998 R08: 0000000000000000 R09: 0000000000000003 [ 308.532281] R10: 0000000079dbe066 R11: 00000000f760b34b R12: 0000000000000001 [ 308.532304] R13: dffffc0000000000 R14: 00000000ffffffea R15: ffff880096889680 [ 308.532328] FS: 00007ff00959cec0(0000) GS:ffff8800d4e00000(0000) knlGS:0000000000000000 [ 308.532359] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 308.532380] CR2: 0000000000000008 CR3: 00000000ca2e3000 CR4: 00000000003406f0 [ 308.532402] Call Trace: [ 308.532440] drm_mode_atomic_ioctl+0x19fa/0x1c00 [drm] [ 308.532488] ? drm_atomic_set_property+0x1220/0x1220 [drm] [ 308.532565] ? avc_has_extended_perms+0xc39/0xff0 [ 308.532593] ? lock_downgrade+0x610/0x610 [ 308.532640] ? drm_atomic_set_property+0x1220/0x1220 [drm] [ 308.532680] drm_ioctl_kernel+0x154/0x1a0 [drm] [ 308.532755] drm_ioctl+0x624/0x8f0 [drm] [ 308.532858] ? drm_atomic_set_property+0x1220/0x1220 [drm] [ 308.532976] ? drm_getunique+0x210/0x210 [drm] [ 308.533061] do_vfs_ioctl+0xd92/0xe40 [ 308.533121] ? ioctl_preallocate+0x1b0/0x1b0 [ 308.533160] ? selinux_capable+0x20/0x20 [ 308.533191] ? do_fcntl+0x1b1/0xbf0 [ 308.533219] ? kasan_slab_free+0xa2/0xb0 [ 308.533249] ? f_getown+0x4b/0xa0 [ 308.533278] ? putname+0xcf/0xe0 [ 308.533309] ? security_file_ioctl+0x57/0x90 [ 308.533342] SyS_ioctl+0x4e/0x80 [ 308.533374] entry_SYSCALL_64_fastpath+0x18/0xad [ 308.533405] RIP: 0033:0x7ff00779e4d7 [ 308.533431] RSP: 002b:00007fff66a043d8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [ 308.533481] RAX: ffffffffffffffda RBX: 000000e7c7ca5910 RCX: 00007ff00779e4d7 [ 308.533560] RDX: 00007fff66a04430 RSI: 00000000c03864bc RDI: 0000000000000003 [ 308.533608] RBP: 00007ff007a5fb00 R08: 000000e7c7ca4620 R09: 000000e7c7ca5e60 [ 308.533647] R10: 0000000000000001 R11: 0000000000000246 R12: 0000000000000070 [ 308.533685] R13: 0000000000000000 R14: 0000000000000000 R15: 000000e7c7ca5930 [ 308.533770] Code: ff df 55 48 89 e5 41 55 41 54 53 48 89 fb 48 83 c7 50 48 89 fa 48 c1 ea 03 80 3c 02 00 74 05 e8 94 d4 16 e7 48 83 7b 50 00 74 02 <0f> ff 4c 8d 6b 58 48 b8 00 00 00 00 00 fc ff df 4c 89 ea 48 c1 [ 308.534086] ---[ end trace 77f11e53b1df44ad ]--- Solve this by adding the missing return. This is also a bugfix because we could end up rejecting updates with -EINVAL because of a early -EDEADLK, while if atomic_check ran to completion it might have downgraded the modeset to a fastset. Signed-off-by: Maarten Lankhorst Testcase: kms_atomic Link: https://patchwork.freedesktop.org/patch/msgid/20170815095706.23624-1-maarten.lankhorst@linux.intel.com Fixes: d34f20d6e2f2 ("drm: Atomic modeset ioctl") Reviewed-by: Daniel Vetter Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/drm_atomic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/drm_atomic.c b/drivers/gpu/drm/drm_atomic.c index 6253775b8d9c..50d74e5ce41b 100644 --- a/drivers/gpu/drm/drm_atomic.c +++ b/drivers/gpu/drm/drm_atomic.c @@ -1247,6 +1247,9 @@ int drm_atomic_check_only(struct drm_atomic_state *state) if (config->funcs->atomic_check) ret = config->funcs->atomic_check(state->dev, state); + if (ret) + return ret; + if (!state->allow_modeset) { for_each_crtc_in_state(state, crtc, crtc_state, i) { if (drm_atomic_crtc_needs_modeset(crtc_state)) { @@ -1257,7 +1260,7 @@ int drm_atomic_check_only(struct drm_atomic_state *state) } } - return ret; + return 0; } EXPORT_SYMBOL(drm_atomic_check_only); From 2b60c153ff3d922e0be3fa9dbe3d53fbc8c78cdb Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 7 Sep 2015 16:03:25 +0300 Subject: [PATCH 046/546] drm: rcar-du: lvds: Fix PLL frequency-related configuration commit 5e1ac3bdc6bbb4f378251b87625b8acfbfc4ae82 upstream. The frequency checks don't match the datasheet, fix them. Signed-off-by: Laurent Pinchart Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c b/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c index 85043c5bad03..5188972a6a02 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c @@ -56,11 +56,11 @@ static int rcar_du_lvdsenc_start(struct rcar_du_lvdsenc *lvds, return ret; /* PLL clock configuration */ - if (freq <= 38000) + if (freq < 39000) pllcr = LVDPLLCR_CEEN | LVDPLLCR_COSEL | LVDPLLCR_PLLDLYCNT_38M; - else if (freq <= 60000) + else if (freq < 61000) pllcr = LVDPLLCR_CEEN | LVDPLLCR_COSEL | LVDPLLCR_PLLDLYCNT_60M; - else if (freq <= 121000) + else if (freq < 121000) pllcr = LVDPLLCR_CEEN | LVDPLLCR_COSEL | LVDPLLCR_PLLDLYCNT_121M; else pllcr = LVDPLLCR_PLLDLYCNT_150M; From 766a097cbfea9a2527d881b67b835bf223d5f79d Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 7 Sep 2015 15:28:17 +0300 Subject: [PATCH 047/546] drm: rcar-du: lvds: Rename PLLEN bit to PLLON commit 82e7c5e4964545352accff4b44bbcaa2c38e7fc1 upstream. The bit is named PLLON in the datasheet, rename it. Signed-off-by: Laurent Pinchart Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c | 2 +- drivers/gpu/drm/rcar-du/rcar_lvds_regs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c b/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c index 5188972a6a02..873e04aa9352 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_lvdsenc.c @@ -102,7 +102,7 @@ static int rcar_du_lvdsenc_start(struct rcar_du_lvdsenc *lvds, /* Turn the PLL on, wait for the startup delay, and turn the output * on. */ - lvdcr0 |= LVDCR0_PLLEN; + lvdcr0 |= LVDCR0_PLLON; rcar_lvds_write(lvds, LVDCR0, lvdcr0); usleep_range(100, 150); diff --git a/drivers/gpu/drm/rcar-du/rcar_lvds_regs.h b/drivers/gpu/drm/rcar-du/rcar_lvds_regs.h index 77cf9289ab65..b1eafd097a79 100644 --- a/drivers/gpu/drm/rcar-du/rcar_lvds_regs.h +++ b/drivers/gpu/drm/rcar-du/rcar_lvds_regs.h @@ -18,7 +18,7 @@ #define LVDCR0_DMD (1 << 12) #define LVDCR0_LVMD_MASK (0xf << 8) #define LVDCR0_LVMD_SHIFT 8 -#define LVDCR0_PLLEN (1 << 4) +#define LVDCR0_PLLON (1 << 4) #define LVDCR0_BEN (1 << 2) #define LVDCR0_LVEN (1 << 1) #define LVDCR0_LVRES (1 << 0) From fbf583912145e9096815a14f1554f44cfaab7059 Mon Sep 17 00:00:00 2001 From: Laurent Pinchart Date: Mon, 3 Oct 2016 20:03:22 +0300 Subject: [PATCH 048/546] drm: rcar-du: Fix crash in encoder failure error path commit 05ee29e94acf0d4b3998c3f93374952de8f90176 upstream. When an encoder fails to initialize the driver prints an error message to the kernel log. The message contains the name of the encoder's DT node, which is NULL for internal encoders. Use the of_node_full_name() macro to avoid dereferencing a NULL pointer, print the output number to add more context to the error, and make sure we still own a reference to the encoder's DT node by delaying the of_node_put() call. Signed-off-by: Laurent Pinchart Reviewed-by: Gustavo Padovan Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_kms.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_kms.c b/drivers/gpu/drm/rcar-du/rcar_du_kms.c index 46429c4be8e5..2b75a4891dec 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_kms.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_kms.c @@ -642,13 +642,13 @@ static int rcar_du_encoders_init_one(struct rcar_du_device *rcdu, } ret = rcar_du_encoder_init(rcdu, enc_type, output, encoder, connector); - of_node_put(encoder); - of_node_put(connector); - if (ret && ret != -EPROBE_DEFER) dev_warn(rcdu->dev, - "failed to initialize encoder %s (%d), skipping\n", - encoder->full_name, ret); + "failed to initialize encoder %s on output %u (%d), skipping\n", + of_node_full_name(encoder), output, ret); + + of_node_put(encoder); + of_node_put(connector); return ret; } From 64f3c534e7acab041e39cb2429b54d8c5141e89a Mon Sep 17 00:00:00 2001 From: Koji Matsuoka Date: Mon, 18 Apr 2016 16:31:30 +0900 Subject: [PATCH 049/546] drm: rcar-du: Fix display timing controller parameter commit 9cdced8a39c04cf798ddb2a27cb5952f7d39f633 upstream. There is a bug in the setting of the DES (Display Enable Signal) register. This current setting occurs 1 dot left shift. The DES register should be set minus one value about the specifying value with H/W specification. This patch corrects it. Signed-off-by: Koji Matsuoka Signed-off-by: Laurent Pinchart Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_crtc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c index 9255b9c096b6..c6b279ddc9c5 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c @@ -171,7 +171,7 @@ static void rcar_du_crtc_set_display_timing(struct rcar_du_crtc *rcrtc) mode->crtc_vsync_start - 1); rcar_du_crtc_write(rcrtc, VCR, mode->crtc_vtotal - 1); - rcar_du_crtc_write(rcrtc, DESR, mode->htotal - mode->hsync_start); + rcar_du_crtc_write(rcrtc, DESR, mode->htotal - mode->hsync_start - 1); rcar_du_crtc_write(rcrtc, DEWR, mode->hdisplay); } From 0d2b7767611fc1762140614452e21d558aa5e462 Mon Sep 17 00:00:00 2001 From: Koji Matsuoka Date: Mon, 16 May 2016 11:28:15 +0900 Subject: [PATCH 050/546] drm: rcar-du: Fix H/V sync signal polarity configuration commit fd1adef3bff0663c5ac31b45bc4a05fafd43d19b upstream. The VSL and HSL bits in the DSMR register set the corresponding horizontal and vertical sync signal polarity to active high. The code got it the wrong way around, fix it. Signed-off-by: Koji Matsuoka Signed-off-by: Laurent Pinchart Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/rcar-du/rcar_du_crtc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c index c6b279ddc9c5..9befd624a5f0 100644 --- a/drivers/gpu/drm/rcar-du/rcar_du_crtc.c +++ b/drivers/gpu/drm/rcar-du/rcar_du_crtc.c @@ -148,8 +148,8 @@ static void rcar_du_crtc_set_display_timing(struct rcar_du_crtc *rcrtc) rcar_du_group_write(rcrtc->group, rcrtc->index % 2 ? OTAR2 : OTAR, 0); /* Signal polarities */ - value = ((mode->flags & DRM_MODE_FLAG_PVSYNC) ? 0 : DSMR_VSL) - | ((mode->flags & DRM_MODE_FLAG_PHSYNC) ? 0 : DSMR_HSL) + value = ((mode->flags & DRM_MODE_FLAG_PVSYNC) ? DSMR_VSL : 0) + | ((mode->flags & DRM_MODE_FLAG_PHSYNC) ? DSMR_HSL : 0) | DSMR_DIPM_DE | DSMR_CSPM; rcar_du_crtc_write(rcrtc, DSMR, value); From 9f57741b44babe107b51fe8252e90e8d7bfb6003 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Wed, 23 Aug 2017 12:46:27 -0400 Subject: [PATCH 051/546] tracing: Fix freeing of filter in create_filter() when set_str is false commit 8b0db1a5bdfcee0dbfa89607672598ae203c9045 upstream. Performing the following task with kmemleak enabled: # cd /sys/kernel/tracing/events/irq/irq_handler_entry/ # echo 'enable_event:kmem:kmalloc:3 if irq >' > trigger # echo 'enable_event:kmem:kmalloc:3 if irq > 31' > trigger # echo scan > /sys/kernel/debug/kmemleak # cat /sys/kernel/debug/kmemleak unreferenced object 0xffff8800b9290308 (size 32): comm "bash", pid 1114, jiffies 4294848451 (age 141.139s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [] kmemleak_alloc+0x4a/0xa0 [] kmem_cache_alloc_trace+0x158/0x290 [] create_filter_start.constprop.28+0x99/0x940 [] create_filter+0xa9/0x160 [] create_event_filter+0xc/0x10 [] set_trigger_filter+0xe5/0x210 [] event_enable_trigger_func+0x324/0x490 [] event_trigger_write+0x1a2/0x260 [] __vfs_write+0xd7/0x380 [] vfs_write+0x101/0x260 [] SyS_write+0xab/0x130 [] entry_SYSCALL_64_fastpath+0x1f/0xbe [] 0xffffffffffffffff The function create_filter() is passed a 'filterp' pointer that gets allocated, and if "set_str" is true, it is up to the caller to free it, even on error. The problem is that the pointer is not freed by create_filter() when set_str is false. This is a bug, and it is not up to the caller to free the filter on error if it doesn't care about the string. Link: http://lkml.kernel.org/r/1502705898-27571-2-git-send-email-chuhu@redhat.com Fixes: 38b78eb85 ("tracing: Factorize filter creation") Reported-by: Chunyu Hu Tested-by: Chunyu Hu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_events_filter.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/trace/trace_events_filter.c b/kernel/trace/trace_events_filter.c index 6816302542b2..f0e5408499b6 100644 --- a/kernel/trace/trace_events_filter.c +++ b/kernel/trace/trace_events_filter.c @@ -1979,6 +1979,10 @@ static int create_filter(struct trace_event_call *call, if (err && set_str) append_filter_err(ps, filter); } + if (err && !set_str) { + free_event_filter(filter); + filter = NULL; + } create_filter_finish(ps); *filterp = filter; From 210b41b4971e7e0e368fef790eca2e4f410d9b07 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Thu, 3 Aug 2017 13:09:03 +0530 Subject: [PATCH 052/546] cifs: Fix df output for users with quota limits commit 42bec214d8bd432be6d32a1acb0a9079ecd4d142 upstream. The df for a SMB2 share triggers a GetInfo call for FS_FULL_SIZE_INFORMATION. The values returned are used to populate struct statfs. The problem is that none of the information returned by the call contains the total blocks available on the filesystem. Instead we use the blocks available to the user ie. quota limitation when filling out statfs.f_blocks. The information returned does contain Actual free units on the filesystem and is used to populate statfs.f_bfree. For users with quota enabled, it can lead to situations where the total free space reported is more than the total blocks on the system ending up with df reports like the following # df -h /mnt/a Filesystem Size Used Avail Use% Mounted on //192.168.22.10/a 2.5G -2.3G 2.5G - /mnt/a To fix this problem, we instead populate both statfs.f_bfree with the same value as statfs.f_bavail ie. CallerAvailableAllocationUnits. This is similar to what is done already in the code for cifs and df now reports the quota information for the user used to mount the share. # df --si /mnt/a Filesystem Size Used Avail Use% Mounted on //192.168.22.10/a 2.7G 101M 2.6G 4% /mnt/a Signed-off-by: Sachin Prabhu Signed-off-by: Pierguido Lambri Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index f4afa3b1cc56..6c484ddf26a9 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -2768,8 +2768,8 @@ copy_fs_info_to_kstatfs(struct smb2_fs_full_size_info *pfs_inf, kst->f_bsize = le32_to_cpu(pfs_inf->BytesPerSector) * le32_to_cpu(pfs_inf->SectorsPerAllocationUnit); kst->f_blocks = le64_to_cpu(pfs_inf->TotalAllocationUnits); - kst->f_bfree = le64_to_cpu(pfs_inf->ActualAvailableAllocationUnits); - kst->f_bavail = le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); + kst->f_bfree = kst->f_bavail = + le64_to_cpu(pfs_inf->CallerAvailableAllocationUnits); return; } From e6066962ca46d6aca02e061dc266816cd6d72e2d Mon Sep 17 00:00:00 2001 From: Ronnie Sahlberg Date: Wed, 23 Aug 2017 14:48:14 +1000 Subject: [PATCH 053/546] cifs: return ENAMETOOLONG for overlong names in cifs_open()/cifs_lookup() commit d3edede29f74d335f81d95a4588f5f136a9f7dcf upstream. Add checking for the path component length and verify it is <= the maximum that the server advertizes via FileFsAttributeInformation. With this patch cifs.ko will now return ENAMETOOLONG instead of ENOENT when users to access an overlong path. To test this, try to cd into a (non-existing) directory on a CIFS share that has a too long name: cd /mnt/aaaaaaaaaaaaaaa... and it now should show a good error message from the shell: bash: cd: /mnt/aaaaaaaaaaaaaaaa...aaaaaa: File name too long rh bz 1153996 Signed-off-by: Ronnie Sahlberg Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/dir.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index 26a3b389a265..fa8df3fef6fc 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -183,15 +183,20 @@ build_path_from_dentry(struct dentry *direntry) } /* + * Don't allow path components longer than the server max. * Don't allow the separator character in a path component. * The VFS will not allow "/", but "\" is allowed by posix. */ static int -check_name(struct dentry *direntry) +check_name(struct dentry *direntry, struct cifs_tcon *tcon) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); int i; + if (unlikely(direntry->d_name.len > + tcon->fsAttrInfo.MaxPathNameComponentLength)) + return -ENAMETOOLONG; + if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { for (i = 0; i < direntry->d_name.len; i++) { if (direntry->d_name.name[i] == '\\') { @@ -489,10 +494,6 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, return finish_no_open(file, res); } - rc = check_name(direntry); - if (rc) - return rc; - xid = get_xid(); cifs_dbg(FYI, "parent inode = 0x%p name is: %pd and dentry = 0x%p\n", @@ -505,6 +506,11 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, } tcon = tlink_tcon(tlink); + + rc = check_name(direntry, tcon); + if (rc) + goto out_free_xid; + server = tcon->ses->server; if (server->ops->new_lease_key) @@ -765,7 +771,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } pTcon = tlink_tcon(tlink); - rc = check_name(direntry); + rc = check_name(direntry, pTcon); if (rc) goto lookup_out; From 87ac57ff972ae029baf54c6766ae1bc55e873854 Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Fri, 18 Aug 2017 11:12:19 -0400 Subject: [PATCH 054/546] nfsd: Limit end of page list when decoding NFSv4 WRITE commit fc788f64f1f3eb31e87d4f53bcf1ab76590d5838 upstream. When processing an NFSv4 WRITE operation, argp->end should never point past the end of the data in the final page of the page list. Otherwise, nfsd4_decode_compound can walk into uninitialized memory. More critical, nfsd4_decode_write is failing to increment argp->pagelen when it increments argp->pagelist. This can cause later xdr decoders to assume more data is available than really is, which can cause server crashes on malformed requests. Signed-off-by: Chuck Lever Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4xdr.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index 3f68a25f2169..544672b440de 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -129,7 +129,7 @@ static void next_decode_page(struct nfsd4_compoundargs *argp) argp->p = page_address(argp->pagelist[0]); argp->pagelist++; if (argp->pagelen < PAGE_SIZE) { - argp->end = argp->p + (argp->pagelen>>2); + argp->end = argp->p + XDR_QUADLEN(argp->pagelen); argp->pagelen = 0; } else { argp->end = argp->p + (PAGE_SIZE>>2); @@ -1246,9 +1246,7 @@ nfsd4_decode_write(struct nfsd4_compoundargs *argp, struct nfsd4_write *write) argp->pagelen -= pages * PAGE_SIZE; len -= pages * PAGE_SIZE; - argp->p = (__be32 *)page_address(argp->pagelist[0]); - argp->pagelist++; - argp->end = argp->p + XDR_QUADLEN(PAGE_SIZE); + next_decode_page(argp); } argp->p += XDR_QUADLEN(len); From 708d19eaf303065d72d6cbdc0a937a5be02cc9c1 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Thu, 22 Jun 2017 15:41:38 +0100 Subject: [PATCH 055/546] perf/core: Fix group {cpu,task} validation commit 64aee2a965cf2954a038b5522f11d2cd2f0f8f3e upstream. Regardless of which events form a group, it does not make sense for the events to target different tasks and/or CPUs, as this leaves the group inconsistent and impossible to schedule. The core perf code assumes that these are consistent across (successfully intialised) groups. Core perf code only verifies this when moving SW events into a HW context. Thus, we can violate this requirement for pure SW groups and pure HW groups, unless the relevant PMU driver happens to perform this verification itself. These mismatched groups subsequently wreak havoc elsewhere. For example, we handle watchpoints as SW events, and reserve watchpoint HW on a per-CPU basis at pmu::event_init() time to ensure that any event that is initialised is guaranteed to have a slot at pmu::add() time. However, the core code only checks the group leader's cpu filter (via event_filter_match()), and can thus install follower events onto CPUs violating thier (mismatched) CPU filters, potentially installing them into a CPU without sufficient reserved slots. This can be triggered with the below test case, resulting in warnings from arch backends. #define _GNU_SOURCE #include #include #include #include #include #include #include static int perf_event_open(struct perf_event_attr *attr, pid_t pid, int cpu, int group_fd, unsigned long flags) { return syscall(__NR_perf_event_open, attr, pid, cpu, group_fd, flags); } char watched_char; struct perf_event_attr wp_attr = { .type = PERF_TYPE_BREAKPOINT, .bp_type = HW_BREAKPOINT_RW, .bp_addr = (unsigned long)&watched_char, .bp_len = 1, .size = sizeof(wp_attr), }; int main(int argc, char *argv[]) { int leader, ret; cpu_set_t cpus; /* * Force use of CPU0 to ensure our CPU0-bound events get scheduled. */ CPU_ZERO(&cpus); CPU_SET(0, &cpus); ret = sched_setaffinity(0, sizeof(cpus), &cpus); if (ret) { printf("Unable to set cpu affinity\n"); return 1; } /* open leader event, bound to this task, CPU0 only */ leader = perf_event_open(&wp_attr, 0, 0, -1, 0); if (leader < 0) { printf("Couldn't open leader: %d\n", leader); return 1; } /* * Open a follower event that is bound to the same task, but a * different CPU. This means that the group should never be possible to * schedule. */ ret = perf_event_open(&wp_attr, 0, 1, leader, 0); if (ret < 0) { printf("Couldn't open mismatched follower: %d\n", ret); return 1; } else { printf("Opened leader/follower with mismastched CPUs\n"); } /* * Open as many independent events as we can, all bound to the same * task, CPU0 only. */ do { ret = perf_event_open(&wp_attr, 0, 0, -1, 0); } while (ret >= 0); /* * Force enable/disble all events to trigger the erronoeous * installation of the follower event. */ printf("Opened all events. Toggling..\n"); for (;;) { prctl(PR_TASK_PERF_EVENTS_DISABLE, 0, 0, 0, 0); prctl(PR_TASK_PERF_EVENTS_ENABLE, 0, 0, 0, 0); } return 0; } Fix this by validating this requirement regardless of whether we're moving events. Signed-off-by: Mark Rutland Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Zhou Chengming Link: http://lkml.kernel.org/r/1498142498-15758-1-git-send-email-mark.rutland@arm.com Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- kernel/events/core.c | 39 +++++++++++++++++++-------------------- 1 file changed, 19 insertions(+), 20 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index 784ab8fe8714..3697063dd09a 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -8473,28 +8473,27 @@ SYSCALL_DEFINE5(perf_event_open, goto err_context; /* - * Do not allow to attach to a group in a different - * task or CPU context: + * Make sure we're both events for the same CPU; + * grouping events for different CPUs is broken; since + * you can never concurrently schedule them anyhow. */ - if (move_group) { - /* - * Make sure we're both on the same task, or both - * per-cpu events. - */ - if (group_leader->ctx->task != ctx->task) - goto err_context; + if (group_leader->cpu != event->cpu) + goto err_context; - /* - * Make sure we're both events for the same CPU; - * grouping events for different CPUs is broken; since - * you can never concurrently schedule them anyhow. - */ - if (group_leader->cpu != event->cpu) - goto err_context; - } else { - if (group_leader->ctx != ctx) - goto err_context; - } + /* + * Make sure we're both on the same task, or both + * per-CPU events. + */ + if (group_leader->ctx->task != ctx->task) + goto err_context; + + /* + * Do not allow to attach to a group in a different task + * or CPU context. If we're moving SW events, we'll fix + * this up later, so allow that. + */ + if (!move_group && group_leader->ctx != ctx) + goto err_context; /* * Only a group leader can be exclusive or pinned From 172bbb8ee44a5a0f48fe4f06b09ec9bd7c1678f9 Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Tue, 27 Jun 2017 17:34:44 +0800 Subject: [PATCH 056/546] Bluetooth: hidp: fix possible might sleep error in hidp_session_thread commit 5da8e47d849d3d37b14129f038782a095b9ad049 upstream. It looks like hidp_session_thread has same pattern as the issue reported in old rfcomm: while (1) { set_current_state(TASK_INTERRUPTIBLE); if (condition) break; // may call might_sleep here schedule(); } __set_current_state(TASK_RUNNING); Which fixed at: dfb2fae Bluetooth: Fix nested sleeps So let's fix it at the same way, also follow the suggestion of: https://lwn.net/Articles/628628/ Signed-off-by: Jeffy Chen Tested-by: AL Yu-Chen Cho Tested-by: Rohit Vaswani Signed-off-by: Marcel Holtmann Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/hidp/core.c | 33 ++++++++++++++++++++++----------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/net/bluetooth/hidp/core.c b/net/bluetooth/hidp/core.c index 0bec4588c3c8..1fc076420d1e 100644 --- a/net/bluetooth/hidp/core.c +++ b/net/bluetooth/hidp/core.c @@ -36,6 +36,7 @@ #define VERSION "1.2" static DECLARE_RWSEM(hidp_session_sem); +static DECLARE_WAIT_QUEUE_HEAD(hidp_session_wq); static LIST_HEAD(hidp_session_list); static unsigned char hidp_keycode[256] = { @@ -1068,12 +1069,12 @@ static int hidp_session_start_sync(struct hidp_session *session) * Wake up session thread and notify it to stop. This is asynchronous and * returns immediately. Call this whenever a runtime error occurs and you want * the session to stop. - * Note: wake_up_process() performs any necessary memory-barriers for us. + * Note: wake_up_interruptible() performs any necessary memory-barriers for us. */ static void hidp_session_terminate(struct hidp_session *session) { atomic_inc(&session->terminate); - wake_up_process(session->task); + wake_up_interruptible(&hidp_session_wq); } /* @@ -1180,7 +1181,9 @@ static void hidp_session_run(struct hidp_session *session) struct sock *ctrl_sk = session->ctrl_sock->sk; struct sock *intr_sk = session->intr_sock->sk; struct sk_buff *skb; + DEFINE_WAIT_FUNC(wait, woken_wake_function); + add_wait_queue(&hidp_session_wq, &wait); for (;;) { /* * This thread can be woken up two ways: @@ -1188,12 +1191,10 @@ static void hidp_session_run(struct hidp_session *session) * session->terminate flag and wakes this thread up. * - Via modifying the socket state of ctrl/intr_sock. This * thread is woken up by ->sk_state_changed(). - * - * Note: set_current_state() performs any necessary - * memory-barriers for us. */ - set_current_state(TASK_INTERRUPTIBLE); + /* Ensure session->terminate is updated */ + smp_mb__before_atomic(); if (atomic_read(&session->terminate)) break; @@ -1227,11 +1228,22 @@ static void hidp_session_run(struct hidp_session *session) hidp_process_transmit(session, &session->ctrl_transmit, session->ctrl_sock); - schedule(); + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } + remove_wait_queue(&hidp_session_wq, &wait); atomic_inc(&session->terminate); - set_current_state(TASK_RUNNING); + + /* Ensure session->terminate is updated */ + smp_mb__after_atomic(); +} + +static int hidp_session_wake_function(wait_queue_t *wait, + unsigned int mode, + int sync, void *key) +{ + wake_up_interruptible(&hidp_session_wq); + return false; } /* @@ -1244,7 +1256,8 @@ static void hidp_session_run(struct hidp_session *session) static int hidp_session_thread(void *arg) { struct hidp_session *session = arg; - wait_queue_t ctrl_wait, intr_wait; + DEFINE_WAIT_FUNC(ctrl_wait, hidp_session_wake_function); + DEFINE_WAIT_FUNC(intr_wait, hidp_session_wake_function); BT_DBG("session %p", session); @@ -1254,8 +1267,6 @@ static int hidp_session_thread(void *arg) set_user_nice(current, -15); hidp_set_timer(session); - init_waitqueue_entry(&ctrl_wait, current); - init_waitqueue_entry(&intr_wait, current); add_wait_queue(sk_sleep(session->ctrl_sock->sk), &ctrl_wait); add_wait_queue(sk_sleep(session->intr_sock->sk), &intr_wait); /* This memory barrier is paired with wq_has_sleeper(). See From f9adf422b99309894fca52767acfa1dcdc094e84 Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Tue, 27 Jun 2017 17:34:43 +0800 Subject: [PATCH 057/546] Bluetooth: cmtp: fix possible might sleep error in cmtp_session commit f06d977309d09253c744e54e75c5295ecc52b7b4 upstream. It looks like cmtp_session has same pattern as the issue reported in old rfcomm: while (1) { set_current_state(TASK_INTERRUPTIBLE); if (condition) break; // may call might_sleep here schedule(); } __set_current_state(TASK_RUNNING); Which fixed at: dfb2fae Bluetooth: Fix nested sleeps So let's fix it at the same way, also follow the suggestion of: https://lwn.net/Articles/628628/ Signed-off-by: Jeffy Chen Reviewed-by: Brian Norris Reviewed-by: AL Yu-Chen Cho Signed-off-by: Marcel Holtmann Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/cmtp/core.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c index 298ed37010e6..3a39fd523e40 100644 --- a/net/bluetooth/cmtp/core.c +++ b/net/bluetooth/cmtp/core.c @@ -281,16 +281,16 @@ static int cmtp_session(void *arg) struct cmtp_session *session = arg; struct sock *sk = session->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + DEFINE_WAIT_FUNC(wait, woken_wake_function); BT_DBG("session %p", session); set_user_nice(current, -15); - init_waitqueue_entry(&wait, current); add_wait_queue(sk_sleep(sk), &wait); while (1) { - set_current_state(TASK_INTERRUPTIBLE); + /* Ensure session->terminate is updated */ + smp_mb__before_atomic(); if (atomic_read(&session->terminate)) break; @@ -307,9 +307,8 @@ static int cmtp_session(void *arg) cmtp_process_transmit(session); - schedule(); + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); down_write(&cmtp_session_sem); @@ -394,7 +393,7 @@ int cmtp_add_connection(struct cmtp_connadd_req *req, struct socket *sock) err = cmtp_attach_device(session); if (err < 0) { atomic_inc(&session->terminate); - wake_up_process(session->task); + wake_up_interruptible(sk_sleep(session->sock->sk)); up_write(&cmtp_session_sem); return err; } @@ -432,7 +431,11 @@ int cmtp_del_connection(struct cmtp_conndel_req *req) /* Stop session thread */ atomic_inc(&session->terminate); - wake_up_process(session->task); + + /* Ensure session->terminate is updated */ + smp_mb__after_atomic(); + + wake_up_interruptible(sk_sleep(session->sock->sk)); } else err = -ENOENT; From e1c7a447883305fd757ca76950d5ef2b4155cebc Mon Sep 17 00:00:00 2001 From: Jeffy Chen Date: Tue, 27 Jun 2017 17:34:42 +0800 Subject: [PATCH 058/546] Bluetooth: bnep: fix possible might sleep error in bnep_session commit 25717382c1dd0ddced2059053e3ca5088665f7a5 upstream. It looks like bnep_session has same pattern as the issue reported in old rfcomm: while (1) { set_current_state(TASK_INTERRUPTIBLE); if (condition) break; // may call might_sleep here schedule(); } __set_current_state(TASK_RUNNING); Which fixed at: dfb2fae Bluetooth: Fix nested sleeps So let's fix it at the same way, also follow the suggestion of: https://lwn.net/Articles/628628/ Signed-off-by: Jeffy Chen Reviewed-by: Brian Norris Reviewed-by: AL Yu-Chen Cho Signed-off-by: Marcel Holtmann Cc: Jiri Slaby Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/bnep/core.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c index 1641367e54ca..69f56073b337 100644 --- a/net/bluetooth/bnep/core.c +++ b/net/bluetooth/bnep/core.c @@ -484,16 +484,16 @@ static int bnep_session(void *arg) struct net_device *dev = s->dev; struct sock *sk = s->sock->sk; struct sk_buff *skb; - wait_queue_t wait; + DEFINE_WAIT_FUNC(wait, woken_wake_function); BT_DBG(""); set_user_nice(current, -15); - init_waitqueue_entry(&wait, current); add_wait_queue(sk_sleep(sk), &wait); while (1) { - set_current_state(TASK_INTERRUPTIBLE); + /* Ensure session->terminate is updated */ + smp_mb__before_atomic(); if (atomic_read(&s->terminate)) break; @@ -515,9 +515,8 @@ static int bnep_session(void *arg) break; netif_wake_queue(dev); - schedule(); + wait_woken(&wait, TASK_INTERRUPTIBLE, MAX_SCHEDULE_TIMEOUT); } - __set_current_state(TASK_RUNNING); remove_wait_queue(sk_sleep(sk), &wait); /* Cleanup session */ @@ -663,7 +662,7 @@ int bnep_del_connection(struct bnep_conndel_req *req) s = __bnep_get_session(req->dst); if (s) { atomic_inc(&s->terminate); - wake_up_process(s->task); + wake_up_interruptible(sk_sleep(s->sock->sk)); } else err = -ENOENT; From 596b97ec2e5e24c966b9cb4aa9a9766e53ecdd43 Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Thu, 29 Jun 2017 12:01:36 -0700 Subject: [PATCH 059/546] binder: use group leader instead of open thread commit c4ea41ba195d01c9af66fb28711a16cc97caa9c5 upstream. The binder allocator assumes that the thread that called binder_open will never die for the lifetime of that proc. That thread is normally the group_leader, however it may not be. Use the group_leader instead of current. Signed-off-by: Todd Kjos Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 47ddfefe2443..76650faca2f6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2966,8 +2966,8 @@ static int binder_open(struct inode *nodp, struct file *filp) proc = kzalloc(sizeof(*proc), GFP_KERNEL); if (proc == NULL) return -ENOMEM; - get_task_struct(current); - proc->tsk = current; + get_task_struct(current->group_leader); + proc->tsk = current->group_leader; INIT_LIST_HEAD(&proc->todo); init_waitqueue_head(&proc->wait); proc->default_priority = task_nice(current); From 1792d6c17cb282fd8e5cd197a8b33cb78484eb6a Mon Sep 17 00:00:00 2001 From: Riley Andrews Date: Thu, 29 Jun 2017 12:01:37 -0700 Subject: [PATCH 060/546] binder: Use wake up hint for synchronous transactions. commit 00b40d613352c623aaae88a44e5ded7c912909d7 upstream. Use wake_up_interruptible_sync() to hint to the scheduler binder transactions are synchronous wakeups. Disable preemption while waking to avoid ping-ponging on the binder lock. Signed-off-by: Todd Kjos Signed-off-by: Omprakash Dhyade Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 76650faca2f6..74da4821b8d6 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1718,8 +1718,12 @@ static void binder_transaction(struct binder_proc *proc, list_add_tail(&t->work.entry, target_list); tcomplete->type = BINDER_WORK_TRANSACTION_COMPLETE; list_add_tail(&tcomplete->entry, &thread->todo); - if (target_wait) - wake_up_interruptible(target_wait); + if (target_wait) { + if (reply || !(t->flags & TF_ONE_WAY)) + wake_up_interruptible_sync(target_wait); + else + wake_up_interruptible(target_wait); + } return; err_get_unused_fd_failed: From 9dac44d5d4b0a7fffe04ad505e0a082e900ad767 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Fri, 28 Jul 2017 13:56:08 +0200 Subject: [PATCH 061/546] ANDROID: binder: fix proc->tsk check. commit b2a6d1b999a4c13e5997bb864694e77172d45250 upstream. Commit c4ea41ba195d ("binder: use group leader instead of open thread")' was incomplete and didn't update a check in binder_mmap(), causing all mmap() calls into the binder driver to fail. Signed-off-by: Martijn Coenen Tested-by: John Stultz Signed-off-by: Greg Kroah-Hartman --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 74da4821b8d6..5531f020e561 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2869,7 +2869,7 @@ static int binder_mmap(struct file *filp, struct vm_area_struct *vma) const char *failure_string; struct binder_buffer *buffer; - if (proc->tsk != current) + if (proc->tsk != current->group_leader) return -EINVAL; if ((vma->vm_end - vma->vm_start) > SZ_4M) From d2c072cb638d3d1a8b34e824266969ab9372bbd5 Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Fri, 4 Aug 2017 01:37:27 +0300 Subject: [PATCH 062/546] iio: imu: adis16480: Fix acceleration scale factor for adis16480 commit fdd0d32eb95f135041236a6885d9006315aa9a1d upstream. According to the datasheet, the range of the acceleration is [-10 g, + 10 g], so the scale factor should be 10 instead of 5. Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/imu/adis16480.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/iio/imu/adis16480.c b/drivers/iio/imu/adis16480.c index 2485b88ee1b6..1880105cc8c4 100644 --- a/drivers/iio/imu/adis16480.c +++ b/drivers/iio/imu/adis16480.c @@ -696,7 +696,7 @@ static const struct adis16480_chip_info adis16480_chip_info[] = { .gyro_max_val = IIO_RAD_TO_DEGREE(22500), .gyro_max_scale = 450, .accel_max_val = IIO_M_S_2_TO_G(12500), - .accel_max_scale = 5, + .accel_max_scale = 10, }, [ADIS16485] = { .channels = adis16485_channels, From 1d7e8cf01e2ed35ccded43b46a01c72ea29e9f15 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Sat, 12 Aug 2017 09:09:21 -0700 Subject: [PATCH 063/546] iio: hid-sensor-trigger: Fix the race with user space powering up sensors commit f1664eaacec31035450132c46ed2915fd2b2049a upstream. It has been reported for a while that with iio-sensor-proxy service the rotation only works after one suspend/resume cycle. This required a wait in the systemd unit file to avoid race. I found a Yoga 900 where I could reproduce this. The problem scenerio is: - During sensor driver init, enable run time PM and also set a auto-suspend for 3 seconds. This result in one runtime resume. But there is a check to avoid a powerup in this sequence, but rpm is active - User space iio-sensor-proxy tries to power up the sensor. Since rpm is active it will simply return. But sensors were not actually powered up in the prior sequence, so actaully the sensors will not work - After 3 seconds the auto suspend kicks If we add a wait in systemd service file to fire iio-sensor-proxy after 3 seconds, then now everything will work as the runtime resume will actually powerup the sensor as this is a user request. To avoid this: - Remove the check to match user requested state, this will cause a brief powerup, but if the iio-sensor-proxy starts immediately it will still work as the sensors are ON. - Also move the autosuspend delay to place when user requested turn off of sensors, like after user finished raw read or buffer disable Signed-off-by: Srinivas Pandruvada Tested-by: Bastien Nocera Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/common/hid-sensors/hid-sensor-trigger.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c index 0a86ef43e781..a8db38db622e 100644 --- a/drivers/iio/common/hid-sensors/hid-sensor-trigger.c +++ b/drivers/iio/common/hid-sensors/hid-sensor-trigger.c @@ -36,8 +36,6 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state) s32 poll_value = 0; if (state) { - if (!atomic_read(&st->user_requested_state)) - return 0; if (sensor_hub_device_open(st->hsdev)) return -EIO; @@ -84,6 +82,9 @@ static int _hid_sensor_power_state(struct hid_sensor_common *st, bool state) &report_val); } + pr_debug("HID_SENSOR %s set power_state %d report_state %d\n", + st->pdev->name, state_val, report_val); + sensor_hub_get_feature(st->hsdev, st->power_state.report_id, st->power_state.index, sizeof(state_val), &state_val); @@ -107,6 +108,7 @@ int hid_sensor_power_state(struct hid_sensor_common *st, bool state) ret = pm_runtime_get_sync(&st->pdev->dev); else { pm_runtime_mark_last_busy(&st->pdev->dev); + pm_runtime_use_autosuspend(&st->pdev->dev); ret = pm_runtime_put_autosuspend(&st->pdev->dev); } if (ret < 0) { @@ -175,8 +177,6 @@ int hid_sensor_setup_trigger(struct iio_dev *indio_dev, const char *name, /* Default to 3 seconds, but can be changed from sysfs */ pm_runtime_set_autosuspend_delay(&attrb->pdev->dev, 3000); - pm_runtime_use_autosuspend(&attrb->pdev->dev); - return ret; error_unreg_trigger: iio_trigger_unregister(trig); From eb2ba09b05a6cdb521b32305890772e618f04783 Mon Sep 17 00:00:00 2001 From: Charles Milette Date: Fri, 18 Aug 2017 16:30:34 -0400 Subject: [PATCH 064/546] staging: rtl8188eu: add RNX-N150NUB support commit f299aec6ebd747298e35934cff7709c6b119ca52 upstream. Add support for USB Device Rosewill RNX-N150NUB. VendorID: 0x0bda, ProductID: 0xffef Signed-off-by: Charles Milette Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rtl8188eu/os_dep/usb_intf.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/staging/rtl8188eu/os_dep/usb_intf.c b/drivers/staging/rtl8188eu/os_dep/usb_intf.c index 02c3feef4e36..c2d2c17550a7 100644 --- a/drivers/staging/rtl8188eu/os_dep/usb_intf.c +++ b/drivers/staging/rtl8188eu/os_dep/usb_intf.c @@ -49,6 +49,7 @@ static struct usb_device_id rtw_usb_id_tbl[] = { {USB_DEVICE(0x2001, 0x3311)}, /* DLink GO-USB-N150 REV B1 */ {USB_DEVICE(0x2357, 0x010c)}, /* TP-Link TL-WN722N v2 */ {USB_DEVICE(0x0df6, 0x0076)}, /* Sitecom N150 v2 */ + {USB_DEVICE(USB_VENDER_ID_REALTEK, 0xffef)}, /* Rosewill RNX-N150NUB */ {} /* Terminating entry */ }; From bfba69dc30abfe54f4f424ad40065a6b429308a6 Mon Sep 17 00:00:00 2001 From: Aaro Koskinen Date: Sun, 24 Jan 2016 00:36:40 +0200 Subject: [PATCH 065/546] ASoC: simple-card: don't fail if sysclk setting is not supported commit ee43a1a0cd2a8f33cddfa1323a60b5cfcf865ba0 upstream. Commit e22579713ae1 ("ASoC: simple card: set cpu-dai sysclk with mclk-fs") added sysclk / SND_SOC_CLOCK_OUT setting, that makes asoc_simple_card_hw_params fail if the operation is not supported, although the intention clearly was to ignore ENOTSUPP. Fix it. The patch fixes audio playback on Kirkwood / OpenRD client, where the following errors are seen: asoc-simple-card sound: ASoC: machine hw_params failed: -524 alsa-lib: /alsa-lib-1.0.28/src/pcm/pcm_hw.c:327:(snd_pcm_hw_hw_params) SNDRV_PCM_IOCTL_HW_PARAMS failed (-524): Unknown error 524 Fixes: e22579713ae1 ("ASoC: simple card: set cpu-dai sysclk with mclk-fs") Signed-off-by: Aaro Koskinen Reviewed-by: Andrew Lunn Signed-off-by: Mark Brown Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/generic/simple-card.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/generic/simple-card.c b/sound/soc/generic/simple-card.c index 54c33204541f..ff6fcd9f92f7 100644 --- a/sound/soc/generic/simple-card.c +++ b/sound/soc/generic/simple-card.c @@ -100,7 +100,7 @@ static int asoc_simple_card_hw_params(struct snd_pcm_substream *substream, if (ret && ret != -ENOTSUPP) goto err; } - + return 0; err: return ret; } From fdc568a4224a21988ad5092959dd00b4614fbec5 Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 26 Oct 2015 08:40:59 +0000 Subject: [PATCH 066/546] ASoC: rsnd: disable SRC.out only when stop timing commit b761bf272bce6dff4d8a7ccf4385c9f3d4018094 upstream. Because SRC is connected to DMA and DMA want to keep dreq when stop timing. This patch makes SRC stop SRC.out only when stop timing. And it stops both SRC.out/SRC.in when quit timing Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/src.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/sound/soc/sh/rcar/src.c b/sound/soc/sh/rcar/src.c index 68b439ed22d7..460d29cbaaa5 100644 --- a/sound/soc/sh/rcar/src.c +++ b/sound/soc/sh/rcar/src.c @@ -691,13 +691,27 @@ static int _rsnd_src_stop_gen2(struct rsnd_mod *mod) { rsnd_src_irq_disable_gen2(mod); - rsnd_mod_write(mod, SRC_CTRL, 0); + /* + * stop SRC output only + * see rsnd_src_quit_gen2 + */ + rsnd_mod_write(mod, SRC_CTRL, 0x01); rsnd_src_error_record_gen2(mod); return rsnd_src_stop(mod); } +static int rsnd_src_quit_gen2(struct rsnd_mod *mod, + struct rsnd_dai_stream *io, + struct rsnd_priv *priv) +{ + /* stop both out/in */ + rsnd_mod_write(mod, SRC_CTRL, 0); + + return 0; +} + static void __rsnd_src_interrupt_gen2(struct rsnd_mod *mod, struct rsnd_dai_stream *io) { @@ -971,7 +985,7 @@ static struct rsnd_mod_ops rsnd_src_gen2_ops = { .probe = rsnd_src_probe_gen2, .remove = rsnd_src_remove_gen2, .init = rsnd_src_init_gen2, - .quit = rsnd_src_quit, + .quit = rsnd_src_quit_gen2, .start = rsnd_src_start_gen2, .stop = rsnd_src_stop_gen2, .hw_params = rsnd_src_hw_params, From e974777b2ecb525cfa43408ee57e22f80abacaaa Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Mon, 26 Oct 2015 08:41:53 +0000 Subject: [PATCH 067/546] ASoC: rsnd: avoid pointless loop in rsnd_mod_interrupt() commit 2daf71ad8da6cb57f919c9c876ee7e42530371df upstream. Current Renesas sound driver doesn't have 1:1 relationship between stream <-> mod because it is supporting MIX. Because of this reason rsnd_mod_interrupt() is searching correspond mod by for loop. But this loop is not needed, because each mod has own type. This patch avoid pointless loop by using mod->type. This patch is good for SSI-parent support, because stream might have 2 SSI as SSI-parent/child. SSI interrupt handler will be called twice if stream has SSI-parent without this patch. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/core.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/sound/soc/sh/rcar/core.c b/sound/soc/sh/rcar/core.c index deed48ef28b8..10b93991ae97 100644 --- a/sound/soc/sh/rcar/core.c +++ b/sound/soc/sh/rcar/core.c @@ -192,19 +192,16 @@ void rsnd_mod_interrupt(struct rsnd_mod *mod, struct rsnd_priv *priv = rsnd_mod_to_priv(mod); struct rsnd_dai_stream *io; struct rsnd_dai *rdai; - int i, j; + int i; - for_each_rsnd_dai(rdai, priv, j) { + for_each_rsnd_dai(rdai, priv, i) { + io = &rdai->playback; + if (mod == io->mod[mod->type]) + callback(mod, io); - for (i = 0; i < RSND_MOD_MAX; i++) { - io = &rdai->playback; - if (mod == io->mod[i]) - callback(mod, io); - - io = &rdai->capture; - if (mod == io->mod[i]) - callback(mod, io); - } + io = &rdai->capture; + if (mod == io->mod[mod->type]) + callback(mod, io); } } From fd504621fa52c9474d03aead381fca4155c66e5d Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Wed, 28 Oct 2015 16:03:48 +0100 Subject: [PATCH 068/546] ASoC: rsnd: Add missing initialization of ADG req_rate commit 8b27418f300cafbdbbb8cfa9c29d398ed34d6723 upstream. If the "clock-frequency" DT property is not found, req_rate is used uninitialized, and the "audio_clkout" clock will be created with an arbitrary clock rate. This uninitialized kernel stack data may leak to userspace through /sys/kernel/debug/clk/clk_summary, cfr. the value in the "rate" column: clock enable_cnt prepare_cnt rate accuracy phase -------------------------------------------------------------------- audio_clkout 0 0 4001836240 0 0 Signed-off-by: Geert Uytterhoeven Acked-by: Kuninori Morimoto Signed-off-by: Mark Brown Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/adg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sh/rcar/adg.c b/sound/soc/sh/rcar/adg.c index 2a5b3a293cd2..b123734f9fbd 100644 --- a/sound/soc/sh/rcar/adg.c +++ b/sound/soc/sh/rcar/adg.c @@ -437,7 +437,7 @@ static void rsnd_adg_get_clkout(struct rsnd_priv *priv, struct device *dev = rsnd_priv_to_dev(priv); struct device_node *np = dev->of_node; u32 ckr, rbgx, rbga, rbgb; - u32 rate, req_rate, div; + u32 rate, req_rate = 0, div; uint32_t count = 0; unsigned long req_48kHz_rate, req_441kHz_rate; int i; From 95fc5ef85428cc435c82b041a49631f6a680e44b Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Tue, 17 Nov 2015 08:28:11 +0000 Subject: [PATCH 069/546] ASoC: rsnd: ssi: 24bit data needs right-aligned settings commit f46a93b820eb3707faf238cd769a004e2504515f upstream. Data left/right aligned is controlled by PDTA bit on SSICR. But default is left-aligned. Thus 24bit sound will be very small sound without this patch. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/ssi.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/sound/soc/sh/rcar/ssi.c b/sound/soc/sh/rcar/ssi.c index 1427ec21bd7e..c62a2947ac14 100644 --- a/sound/soc/sh/rcar/ssi.c +++ b/sound/soc/sh/rcar/ssi.c @@ -39,6 +39,7 @@ #define SCKP (1 << 13) /* Serial Bit Clock Polarity */ #define SWSP (1 << 12) /* Serial WS Polarity */ #define SDTA (1 << 10) /* Serial Data Alignment */ +#define PDTA (1 << 9) /* Parallel Data Alignment */ #define DEL (1 << 8) /* Serial Data Delay */ #define CKDV(v) (v << 4) /* Serial Clock Division Ratio */ #define TRMD (1 << 1) /* Transmit/Receive Mode Select */ @@ -286,7 +287,7 @@ static int rsnd_ssi_init(struct rsnd_mod *mod, struct snd_pcm_runtime *runtime = rsnd_io_to_runtime(io); u32 cr; - cr = FORCE; + cr = FORCE | PDTA; /* * always use 32bit system word for easy clock calculation. From 4ec0b2c2d2354ca5322513b83cbf925eec8c38ba Mon Sep 17 00:00:00 2001 From: Kuninori Morimoto Date: Thu, 25 Feb 2016 05:51:12 +0000 Subject: [PATCH 070/546] ASoC: rsnd: don't call update callback if it was NULL commit d7289565483c65094d0473555625a4acd89567d3 upstream. Signed-off-by: Kuninori Morimoto Signed-off-by: Mark Brown Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- sound/soc/sh/rcar/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/sound/soc/sh/rcar/core.c b/sound/soc/sh/rcar/core.c index 10b93991ae97..362446c36c9e 100644 --- a/sound/soc/sh/rcar/core.c +++ b/sound/soc/sh/rcar/core.c @@ -1016,7 +1016,7 @@ static int rsnd_kctrl_put(struct snd_kcontrol *kctrl, } } - if (change) + if (change && cfg->update) cfg->update(cfg->io, mod); return change; From 1aac8ffd619f893b93cbd4c28abe43ecee7f82ef Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 5 Jun 2017 14:00:52 -0600 Subject: [PATCH 071/546] ntb_transport: fix qp count bug commit cb827ee6ccc3e480f0d9c0e8e53eef55be5b0414 upstream. In cases where there are more mw's than spads/2-2, the mw count gets reduced to match the limitation. ntb_transport also tries to ensure that there are fewer qps than mws but uses the full mw count instead of the reduced one. When this happens, the math in 'ntb_transport_setup_qp_mw' will get confused and result in a kernel paging request bug. This patch fixes the bug by reducing qp_count to the reduced mw count instead of the full mw count. Signed-off-by: Logan Gunthorpe Fixes: e26a5843f7f5 ("NTB: Split ntb_hw_intel and ntb_transport drivers") Acked-by: Allen Hubbe Signed-off-by: Jon Mason Signed-off-by: Greg Kroah-Hartman --- drivers/ntb/ntb_transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index ecc6fb9ca92f..1c02a48a7e99 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -1065,8 +1065,8 @@ static int ntb_transport_probe(struct ntb_client *self, struct ntb_dev *ndev) qp_count = ilog2(qp_bitmap); if (max_num_clients && max_num_clients < qp_count) qp_count = max_num_clients; - else if (mw_count < qp_count) - qp_count = mw_count; + else if (nt->mw_count < qp_count) + qp_count = nt->mw_count; qp_bitmap &= BIT_ULL(qp_count) - 1; From 4e5f2c2041503bd0f855b9467de3cd05a8748c91 Mon Sep 17 00:00:00 2001 From: Logan Gunthorpe Date: Mon, 5 Jun 2017 14:00:53 -0600 Subject: [PATCH 072/546] ntb_transport: fix bug calculating num_qps_mw commit 8e8496e0e9564b66165f5219a4e8ed20b0d3fc6b upstream. A divide by zero error occurs if qp_count is less than mw_count because num_qps_mw is calculated to be zero. The calculation appears to be incorrect. The requirement is for num_qps_mw to be set to qp_count / mw_count with any remainder divided among the earlier mws. For example, if mw_count is 5 and qp_count is 12 then mws 0 and 1 will have 3 qps per window and mws 2 through 4 will have 2 qps per window. Thus, when mw_num < qp_count % mw_count, num_qps_mw is 1 higher than when mw_num >= qp_count. Signed-off-by: Logan Gunthorpe Fixes: e26a5843f7f5 ("NTB: Split ntb_hw_intel and ntb_transport drivers") Acked-by: Allen Hubbe Signed-off-by: Jon Mason Signed-off-by: Greg Kroah-Hartman --- drivers/ntb/ntb_transport.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 1c02a48a7e99..3bbdf60f8908 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -599,7 +599,7 @@ static int ntb_transport_setup_qp_mw(struct ntb_transport_ctx *nt, if (!mw->virt_addr) return -ENOMEM; - if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count) + if (mw_num < qp_count % mw_count) num_qps_mw = qp_count / mw_count + 1; else num_qps_mw = qp_count / mw_count; @@ -947,7 +947,7 @@ static int ntb_transport_init_queue(struct ntb_transport_ctx *nt, qp->event_handler = NULL; ntb_qp_link_down_reset(qp); - if (qp_count % mw_count && mw_num + 1 < qp_count / mw_count) + if (mw_num < qp_count % mw_count) num_qps_mw = qp_count / mw_count + 1; else num_qps_mw = qp_count / mw_count; From b526de00a9b09eb12f1c68df25a50139f315e400 Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Wed, 22 Mar 2017 18:33:23 +0100 Subject: [PATCH 073/546] ACPI: ioapic: Clear on-stack resource before using it commit e3d5092b6756b9e0b08f94bbeafcc7afe19f0996 upstream. The on-stack resource-window 'win' in setup_res() is not properly initialized. This causes the pointers in the embedded 'struct resource' to contain stale addresses. These pointers (in my case the ->child pointer) later get propagated to the global iomem_resources list, causing a #GP exception when the list is traversed in iomem_map_sanity_check(). Fixes: c183619b63ec (x86/irq, ACPI: Implement ACPI driver to support IOAPIC hotplug) Signed-off-by: Joerg Roedel Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/ioapic.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/acpi/ioapic.c b/drivers/acpi/ioapic.c index ccdc8db16bb8..fa2cf2dc4e33 100644 --- a/drivers/acpi/ioapic.c +++ b/drivers/acpi/ioapic.c @@ -45,6 +45,12 @@ static acpi_status setup_res(struct acpi_resource *acpi_res, void *data) struct resource *res = data; struct resource_win win; + /* + * We might assign this to 'res' later, make sure all pointers are + * cleared before the resource is added to the global list + */ + memset(&win, 0, sizeof(win)); + res->flags = 0; if (acpi_dev_filter_resource_type(acpi_res, IORESOURCE_MEM) == 0) return AE_OK; From 12b25d2a52f0ff998ce1d4385c4050e9e9e5d072 Mon Sep 17 00:00:00 2001 From: James Morse Date: Thu, 16 Mar 2017 14:30:39 +0000 Subject: [PATCH 074/546] ACPI / APEI: Add missing synchronize_rcu() on NOTIFY_SCI removal commit 7d64f82cceb21e6d95db312d284f5f195e120154 upstream. When removing a GHES device notified by SCI, list_del_rcu() is used, ghes_remove() should call synchronize_rcu() before it goes on to call kfree(ghes), otherwise concurrent RCU readers may still hold this list entry after it has been freed. Signed-off-by: James Morse Reviewed-by: "Huang, Ying" Fixes: 81e88fdc432a (ACPI, APEI, Generic Hardware Error Source POLL/IRQ/NMI notification type support) Signed-off-by: Rafael J. Wysocki Signed-off-by: Greg Kroah-Hartman --- drivers/acpi/apei/ghes.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/apei/ghes.c b/drivers/acpi/apei/ghes.c index eac4f3b02df9..bb81cd05f0bc 100644 --- a/drivers/acpi/apei/ghes.c +++ b/drivers/acpi/apei/ghes.c @@ -1067,6 +1067,7 @@ static int ghes_remove(struct platform_device *ghes_dev) if (list_empty(&ghes_sci)) unregister_acpi_hed_notifier(&ghes_notifier_sci); mutex_unlock(&ghes_list_mutex); + synchronize_rcu(); break; case ACPI_HEST_NOTIFY_NMI: ghes_nmi_remove(ghes); From 717bd21f81a3ac5cb50d015b200f3949be1b1923 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 30 Aug 2017 10:19:42 +0200 Subject: [PATCH 075/546] Linux 4.4.85 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 9d77ac063ec0..0f3d843f42a7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 84 +SUBLEVEL = 85 EXTRAVERSION = NAME = Blurry Fish Butt From da8477a6695afe14070267b885e0bc3788d0d34a Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Fri, 18 Nov 2016 17:14:01 +0100 Subject: [PATCH 076/546] scsi: isci: avoid array subscript warning commit 5cfa2a3c7342bd0b50716c8bb32ee491af43c785 upstream. I'm getting a new warning with gcc-7: isci/remote_node_context.c: In function 'sci_remote_node_context_destruct': isci/remote_node_context.c:69:16: error: array subscript is above array bounds [-Werror=array-bounds] This is odd, since we clearly cover all values for enum scis_sds_remote_node_context_states here. Anyway, checking for an array overflow can't harm and it makes the warning go away. Signed-off-by: Arnd Bergmann Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/isci/remote_node_context.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/scsi/isci/remote_node_context.c b/drivers/scsi/isci/remote_node_context.c index 1910100638a2..00602abec0ea 100644 --- a/drivers/scsi/isci/remote_node_context.c +++ b/drivers/scsi/isci/remote_node_context.c @@ -66,6 +66,9 @@ const char *rnc_state_name(enum scis_sds_remote_node_context_states state) { static const char * const strings[] = RNC_STATES; + if (state >= ARRAY_SIZE(strings)) + return "UNKNOWN"; + return strings[state]; } #undef C From 9a64425945a0c8698e36ed55354db1245982c052 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 12 May 2016 18:04:16 +0200 Subject: [PATCH 077/546] ALSA: au88x0: Fix zero clear of stream->resources commit 639db596165746ca87bbcb56559b094fd9042890 upstream. There are a few calls of memset() to stream->resources, but they all are called in a wrong size, sizeof(unsigned char) * VORTEX_RESOURCE_LAST, while this field is a u32 array. This may leave the memories not zero-cleared. Fix it by replacing them with a simpler sizeof(stream->resources) instead. Reported-by: David Binderman Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/au88x0/au88x0_core.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/sound/pci/au88x0/au88x0_core.c b/sound/pci/au88x0/au88x0_core.c index 74177189063c..d3125c169684 100644 --- a/sound/pci/au88x0/au88x0_core.c +++ b/sound/pci/au88x0/au88x0_core.c @@ -2150,8 +2150,7 @@ vortex_adb_allocroute(vortex_t *vortex, int dma, int nr_ch, int dir, stream->resources, en, VORTEX_RESOURCE_SRC)) < 0) { memset(stream->resources, 0, - sizeof(unsigned char) * - VORTEX_RESOURCE_LAST); + sizeof(stream->resources)); return -EBUSY; } if (stream->type != VORTEX_PCM_A3D) { @@ -2161,7 +2160,7 @@ vortex_adb_allocroute(vortex_t *vortex, int dma, int nr_ch, int dir, VORTEX_RESOURCE_MIXIN)) < 0) { memset(stream->resources, 0, - sizeof(unsigned char) * VORTEX_RESOURCE_LAST); + sizeof(stream->resources)); return -EBUSY; } } @@ -2174,8 +2173,7 @@ vortex_adb_allocroute(vortex_t *vortex, int dma, int nr_ch, int dir, stream->resources, en, VORTEX_RESOURCE_A3D)) < 0) { memset(stream->resources, 0, - sizeof(unsigned char) * - VORTEX_RESOURCE_LAST); + sizeof(stream->resources)); dev_err(vortex->card->dev, "out of A3D sources. Sorry\n"); return -EBUSY; @@ -2289,8 +2287,7 @@ vortex_adb_allocroute(vortex_t *vortex, int dma, int nr_ch, int dir, VORTEX_RESOURCE_MIXOUT)) < 0) { memset(stream->resources, 0, - sizeof(unsigned char) * - VORTEX_RESOURCE_LAST); + sizeof(stream->resources)); return -EBUSY; } if ((src[i] = @@ -2298,8 +2295,7 @@ vortex_adb_allocroute(vortex_t *vortex, int dma, int nr_ch, int dir, stream->resources, en, VORTEX_RESOURCE_SRC)) < 0) { memset(stream->resources, 0, - sizeof(unsigned char) * - VORTEX_RESOURCE_LAST); + sizeof(stream->resources)); return -EBUSY; } } From 05429bbfd726ec0070a2381d8ae186b6899d2ae1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Tue, 19 Jan 2016 00:05:28 +0000 Subject: [PATCH 078/546] btrfs: remove duplicate const specifier commit fb75d857a31d600cc0c37b8c7d914014f7fa3f9a upstream. duplicate const is redundant so remove it Signed-off-by: Colin Ian King Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9c62a6f9757a..600c67ef8a03 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, From 41685ae5cd7bc51f8d796135611e7862f772ed19 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Sun, 5 Jun 2016 11:41:42 +0200 Subject: [PATCH 079/546] i2c: jz4780: drop superfluous init commit 27bfeb5a0619554d9734fb39e14f0e80fa7c342c upstream. David reported that the length for memset was incorrect (element sizes were not taken into account). Then I saw that we are clearing kzalloced memory, so we can simply drop this code. Reported-by: David Binderman Reviewed-by: Axel Lin Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-jz4780.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/i2c/busses/i2c-jz4780.c b/drivers/i2c/busses/i2c-jz4780.c index f325663c27c5..4b58e8aaf5c5 100644 --- a/drivers/i2c/busses/i2c-jz4780.c +++ b/drivers/i2c/busses/i2c-jz4780.c @@ -786,10 +786,6 @@ static int jz4780_i2c_probe(struct platform_device *pdev) jz4780_i2c_writew(i2c, JZ4780_I2C_INTM, 0x0); - i2c->cmd = 0; - memset(i2c->cmd_buf, 0, BUFSIZE); - memset(i2c->data_buf, 0, BUFSIZE); - i2c->irq = platform_get_irq(pdev, 0); ret = devm_request_irq(&pdev->dev, i2c->irq, jz4780_i2c_irq, 0, dev_name(&pdev->dev), i2c); From 2f3e97a814c8f906d3334c85271aacb7a8783490 Mon Sep 17 00:00:00 2001 From: Florian Meier Date: Thu, 14 Jul 2016 12:07:26 -0700 Subject: [PATCH 080/546] gcov: add support for gcc version >= 6 commit d02038f972538b93011d78c068f44514fbde0a8c upstream. Link: http://lkml.kernel.org/r/20160701130914.GA23225@styxhp Signed-off-by: Florian Meier Reviewed-by: Peter Oberparleiter Tested-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/gcov/gcc_4_7.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index e25e92fb44fa..6a5c239c7669 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,7 @@ #include #include "gcov.h" -#if __GNUC__ == 5 && __GNUC_MINOR__ >= 1 +#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 From d255fffdb532caf103369a5894be942e528c3151 Mon Sep 17 00:00:00 2001 From: Martin Liska Date: Fri, 12 May 2017 15:46:35 -0700 Subject: [PATCH 081/546] gcov: support GCC 7.1 commit 05384213436ab690c46d9dfec706b80ef8d671ab upstream. Starting from GCC 7.1, __gcov_exit is a new symbol expected to be implemented in a profiling runtime. [akpm@linux-foundation.org: coding-style fixes] [mliska@suse.cz: v2] Link: http://lkml.kernel.org/r/e63a3c59-0149-c97e-4084-20ca8f146b26@suse.cz Link: http://lkml.kernel.org/r/8c4084fa-3885-29fe-5fc4-0d4ca199c785@suse.cz Signed-off-by: Martin Liska Acked-by: Peter Oberparleiter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- kernel/gcov/base.c | 6 ++++++ kernel/gcov/gcc_4_7.c | 4 +++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/kernel/gcov/base.c b/kernel/gcov/base.c index 7080ae1eb6c1..f850e906564b 100644 --- a/kernel/gcov/base.c +++ b/kernel/gcov/base.c @@ -98,6 +98,12 @@ void __gcov_merge_icall_topn(gcov_type *counters, unsigned int n_counters) } EXPORT_SYMBOL(__gcov_merge_icall_topn); +void __gcov_exit(void) +{ + /* Unused. */ +} +EXPORT_SYMBOL(__gcov_exit); + /** * gcov_enable_events - enable event reporting through gcov_event() * diff --git a/kernel/gcov/gcc_4_7.c b/kernel/gcov/gcc_4_7.c index 6a5c239c7669..46a18e72bce6 100644 --- a/kernel/gcov/gcc_4_7.c +++ b/kernel/gcov/gcc_4_7.c @@ -18,7 +18,9 @@ #include #include "gcov.h" -#if (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) +#if (__GNUC__ >= 7) +#define GCOV_COUNTERS 9 +#elif (__GNUC__ > 5) || (__GNUC__ == 5 && __GNUC_MINOR__ >= 1) #define GCOV_COUNTERS 10 #elif __GNUC__ == 4 && __GNUC_MINOR__ >= 9 #define GCOV_COUNTERS 9 From 5acdbe667cf467cefb8fc13ded0ecca58dd9a761 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Javier=20Gonz=C3=A1lez?= Date: Thu, 7 Jul 2016 09:54:09 +0200 Subject: [PATCH 082/546] lightnvm: initialize ppa_addr in dev_to_generic_addr() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5389a1dfb39786df08d4f6a482bd2734b1b50e33 upstream. The ->reserved bit is not initialized when allocated on stack. This may lead targets to misinterpret the PPA as cached. Signed-off-by: Javier González Signed-off-by: Matias Bjørling Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- include/linux/lightnvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 782d4e814e21..4bc4b1b13193 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -310,6 +310,7 @@ static inline struct ppa_addr dev_to_generic_addr(struct nvm_dev *dev, { struct ppa_addr l; + l.ppa = 0; /* * (r.ppa << X offset) & X len bitmask. X eq. blk, pg, etc. */ From 389328ea1379e14e8bd8678af96d7a83bc1514e5 Mon Sep 17 00:00:00 2001 From: Jiri Slaby Date: Fri, 14 Oct 2016 11:23:09 +0200 Subject: [PATCH 083/546] p54: memset(0) whole array commit 6f17581788206444cbbcdbc107498f85e9765e3d upstream. gcc 7 complains: drivers/net/wireless/intersil/p54/fwio.c: In function 'p54_scan': drivers/net/wireless/intersil/p54/fwio.c:491:4: warning: 'memset' used with length equal to number of elements without multiplication by element size [-Wmemset-elt-size] Fix that by passing the correct size to memset. Signed-off-by: Jiri Slaby Cc: Christian Lamparter Cc: Kalle Valo Acked-by: Christian Lamparter Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/p54/fwio.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/wireless/p54/fwio.c b/drivers/net/wireless/p54/fwio.c index 257a9eadd595..4ac6764f4897 100644 --- a/drivers/net/wireless/p54/fwio.c +++ b/drivers/net/wireless/p54/fwio.c @@ -488,7 +488,7 @@ int p54_scan(struct p54_common *priv, u16 mode, u16 dwell) entry += sizeof(__le16); chan->pa_points_per_curve = 8; - memset(chan->curve_data, 0, sizeof(*chan->curve_data)); + memset(chan->curve_data, 0, sizeof(chan->curve_data)); memcpy(chan->curve_data, entry, sizeof(struct p54_pa_curve_data_sample) * min((u8)8, curve_data->points_per_channel)); From da981044d049d1ec3bf454868aa44602e1df8582 Mon Sep 17 00:00:00 2001 From: James Smart Date: Thu, 31 Mar 2016 14:12:31 -0700 Subject: [PATCH 084/546] lpfc: Fix Device discovery failures during switch reboot test. commit 342b59caa66240b670285d519fdfe2c44289b516 upstream. When the switch is rebooted, the lpfc driver fails to log into the fabric, and Unexpected timeout message is seen. Fix: Do not issue RegVFI if the FLOGI was internally aborted. Signed-off-by: Dick Kennedy Signed-off-by: James Smart Signed-off-by: Martin K. Petersen Signed-off-by: Guilherme G. Piccoli Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/lpfc/lpfc_els.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/lpfc/lpfc_els.c b/drivers/scsi/lpfc/lpfc_els.c index 0e6aaef9a038..c74f74ab981c 100644 --- a/drivers/scsi/lpfc/lpfc_els.c +++ b/drivers/scsi/lpfc/lpfc_els.c @@ -1054,7 +1054,10 @@ lpfc_cmpl_els_flogi(struct lpfc_hba *phba, struct lpfc_iocbq *cmdiocb, lpfc_sli4_unreg_all_rpis(vport); } } - lpfc_issue_reg_vfi(vport); + + /* Do not register VFI if the driver aborted FLOGI */ + if (!lpfc_error_lost_link(irsp)) + lpfc_issue_reg_vfi(vport); lpfc_nlp_put(ndlp); goto out; } From a7a074f3a4d547a525748bfde179c4eb787d4f47 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 11 Jul 2017 15:19:22 +0100 Subject: [PATCH 085/546] arm64: mm: abort uaccess retries upon fatal signal commit 289d07a2dc6c6b6f3e4b8a62669320d99dbe6c3d upstream. When there's a fatal signal pending, arm64's do_page_fault() implementation returns 0. The intent is that we'll return to the faulting userspace instruction, delivering the signal on the way. However, if we take a fatal signal during fixing up a uaccess, this results in a return to the faulting kernel instruction, which will be instantly retried, resulting in the same fault being taken forever. As the task never reaches userspace, the signal is not delivered, and the task is left unkillable. While the task is stuck in this state, it can inhibit the forward progress of the system. To avoid this, we must ensure that when a fatal signal is pending, we apply any necessary fixup for a faulting kernel instruction. Thus we will return to an error path, and it is up to that code to make forward progress towards delivering the fatal signal. Cc: Catalin Marinas Cc: Laura Abbott Reviewed-by: Steve Capper Tested-by: Steve Capper Reviewed-by: James Morse Tested-by: James Morse Signed-off-by: Mark Rutland Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/mm/fault.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index a4b466424a32..7fabf49f2aeb 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -313,8 +313,11 @@ static int __kprobes do_page_fault(unsigned long addr, unsigned int esr, * signal first. We do not need to release the mmap_sem because it * would already be released in __lock_page_or_retry in mm/filemap.c. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (!user_mode(regs)) + goto no_context; return 0; + } /* * Major/minor page fault accounting is only done on the initial From 218720fe593965cb2bb45c3e463bb9991ceca40c Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 19 Jul 2017 14:53:02 +0200 Subject: [PATCH 086/546] x86/io: Add "memory" clobber to insb/insw/insl/outsb/outsw/outsl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 7206f9bf108eb9513d170c73f151367a1bdf3dbf upstream. The x86 version of insb/insw/insl uses an inline assembly that does not have the target buffer listed as an output. This can confuse the compiler, leading it to think that a subsequent access of the buffer is uninitialized: drivers/net/wireless/wl3501_cs.c: In function ‘wl3501_mgmt_scan_confirm’: drivers/net/wireless/wl3501_cs.c:665:9: error: ‘sig.status’ is used uninitialized in this function [-Werror=uninitialized] drivers/net/wireless/wl3501_cs.c:668:12: error: ‘sig.cap_info’ may be used uninitialized in this function [-Werror=maybe-uninitialized] drivers/net/sb1000.c: In function 'sb1000_rx': drivers/net/sb1000.c:775:9: error: 'st[0]' is used uninitialized in this function [-Werror=uninitialized] drivers/net/sb1000.c:776:10: error: 'st[1]' may be used uninitialized in this function [-Werror=maybe-uninitialized] drivers/net/sb1000.c:784:11: error: 'st[1]' may be used uninitialized in this function [-Werror=maybe-uninitialized] I tried to mark the exact input buffer as an output here, but couldn't figure it out. As suggested by Linus, marking all memory as clobbered however is good enough too. For the outs operations, I also add the memory clobber, to force the input to be written to local variables. This is probably already guaranteed by the "asm volatile", but it can't hurt to do this for symmetry. Suggested-by: Linus Torvalds Signed-off-by: Arnd Bergmann Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tom Lendacky Link: http://lkml.kernel.org/r/20170719125310.2487451-5-arnd@arndb.de Link: https://lkml.org/lkml/2017/7/12/605 Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/io.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/io.h b/arch/x86/include/asm/io.h index de25aad07853..9016b4b70375 100644 --- a/arch/x86/include/asm/io.h +++ b/arch/x86/include/asm/io.h @@ -304,13 +304,13 @@ static inline unsigned type in##bwl##_p(int port) \ static inline void outs##bwl(int port, const void *addr, unsigned long count) \ { \ asm volatile("rep; outs" #bwl \ - : "+S"(addr), "+c"(count) : "d"(port)); \ + : "+S"(addr), "+c"(count) : "d"(port) : "memory"); \ } \ \ static inline void ins##bwl(int port, void *addr, unsigned long count) \ { \ asm volatile("rep; ins" #bwl \ - : "+D"(addr), "+c"(count) : "d"(port)); \ + : "+D"(addr), "+c"(count) : "d"(port) : "memory"); \ } BUILDIO(b, b, char) From 823086b057aabde5659c5f8638051613cba86247 Mon Sep 17 00:00:00 2001 From: Dave Martin Date: Fri, 18 Aug 2017 16:57:01 +0100 Subject: [PATCH 087/546] arm64: fpsimd: Prevent registers leaking across exec commit 096622104e14d8a1db4860bd557717067a0515d2 upstream. There are some tricky dependencies between the different stages of flushing the FPSIMD register state during exec, and these can race with context switch in ways that can cause the old task's regs to leak across. In particular, a context switch during the memset() can cause some of the task's old FPSIMD registers to reappear. Disabling preemption for this small window would be no big deal for performance: preemption is already disabled for similar scenarios like updating the FPSIMD registers in sigreturn. So, instead of rearranging things in ways that might swap existing subtle bugs for new ones, this patch just disables preemption around the FPSIMD state flushing so that races of this type can't occur here. This brings fpsimd_flush_thread() into line with other code paths. Fixes: 674c242c9323 ("arm64: flush FP/SIMD state correctly after execve()") Reviewed-by: Ard Biesheuvel Signed-off-by: Dave Martin Signed-off-by: Will Deacon Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/fpsimd.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 4c46c54a3ad7..6638903f0cb9 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -157,9 +157,11 @@ void fpsimd_thread_switch(struct task_struct *next) void fpsimd_flush_thread(void) { + preempt_disable(); memset(¤t->thread.fpsimd_state, 0, sizeof(struct fpsimd_state)); fpsimd_flush_task_state(current); set_thread_flag(TIF_FOREIGN_FPSTATE); + preempt_enable(); } /* From a4075bbb67b9562b9599affc6fb38f04bd7073ff Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 7 Apr 2017 09:34:14 +0200 Subject: [PATCH 088/546] scsi: sg: protect accesses to 'reserved' page array commit 1bc0eb0446158cc76562176b80623aa119afee5b upstream. The 'reserved' page array is used as a short-cut for mapping data, saving us to allocate pages per request. However, the 'reserved' array is only capable of holding one request, so this patch introduces a mutex for protect 'sg_fd' against concurrent accesses. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman [toddpoynor@google.com: backport to 3.18-4.9, fixup for bad ioctl SG_SET_FORCE_LOW_DMA code removed in later versions and not modified by the original patch.] Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 47 ++++++++++++++++++++++++++--------------------- 1 file changed, 26 insertions(+), 21 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 6514636431ab..594ba5874693 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -153,6 +153,7 @@ typedef struct sg_fd { /* holds the state of a file descriptor */ struct sg_device *parentdp; /* owning device */ wait_queue_head_t read_wait; /* queue read until command done */ rwlock_t rq_list_lock; /* protect access to list in req_arr */ + struct mutex f_mutex; /* protect against changes in this fd */ int timeout; /* defaults to SG_DEFAULT_TIMEOUT */ int timeout_user; /* defaults to SG_DEFAULT_TIMEOUT_USER */ Sg_scatter_hold reserve; /* buffer held for this file descriptor */ @@ -166,6 +167,7 @@ typedef struct sg_fd { /* holds the state of a file descriptor */ unsigned char next_cmd_len; /* 0: automatic, >0: use on next write() */ char keep_orphan; /* 0 -> drop orphan (def), 1 -> keep for read() */ char mmap_called; /* 0 -> mmap() never called on this fd */ + char res_in_use; /* 1 -> 'reserve' array in use */ struct kref f_ref; struct execute_work ew; } Sg_fd; @@ -209,7 +211,6 @@ static void sg_remove_sfp(struct kref *); static Sg_request *sg_get_rq_mark(Sg_fd * sfp, int pack_id); static Sg_request *sg_add_request(Sg_fd * sfp); static int sg_remove_request(Sg_fd * sfp, Sg_request * srp); -static int sg_res_in_use(Sg_fd * sfp); static Sg_device *sg_get_dev(int dev); static void sg_device_destroy(struct kref *kref); @@ -625,6 +626,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) } buf += SZ_SG_HEADER; __get_user(opcode, buf); + mutex_lock(&sfp->f_mutex); if (sfp->next_cmd_len > 0) { cmd_size = sfp->next_cmd_len; sfp->next_cmd_len = 0; /* reset so only this write() effected */ @@ -633,6 +635,7 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) if ((opcode >= 0xc0) && old_hdr.twelve_byte) cmd_size = 12; } + mutex_unlock(&sfp->f_mutex); SCSI_LOG_TIMEOUT(4, sg_printk(KERN_INFO, sdp, "sg_write: scsi opcode=0x%02x, cmd_size=%d\n", (int) opcode, cmd_size)); /* Determine buffer size. */ @@ -732,7 +735,7 @@ sg_new_write(Sg_fd *sfp, struct file *file, const char __user *buf, sg_remove_request(sfp, srp); return -EINVAL; /* either MMAP_IO or DIRECT_IO (not both) */ } - if (sg_res_in_use(sfp)) { + if (sfp->res_in_use) { sg_remove_request(sfp, srp); return -EBUSY; /* reserve buffer already being used */ } @@ -902,7 +905,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return result; if (val) { sfp->low_dma = 1; - if ((0 == sfp->low_dma) && (0 == sg_res_in_use(sfp))) { + if ((0 == sfp->low_dma) && !sfp->res_in_use) { val = (int) sfp->reserve.bufflen; sg_remove_scat(sfp, &sfp->reserve); sg_build_reserve(sfp, val); @@ -977,12 +980,18 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return -EINVAL; val = min_t(int, val, max_sectors_bytes(sdp->device->request_queue)); + mutex_lock(&sfp->f_mutex); if (val != sfp->reserve.bufflen) { - if (sg_res_in_use(sfp) || sfp->mmap_called) + if (sfp->mmap_called || + sfp->res_in_use) { + mutex_unlock(&sfp->f_mutex); return -EBUSY; + } + sg_remove_scat(sfp, &sfp->reserve); sg_build_reserve(sfp, val); } + mutex_unlock(&sfp->f_mutex); return 0; case SG_GET_RESERVED_SIZE: val = min_t(int, sfp->reserve.bufflen, @@ -1737,13 +1746,22 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) md = &map_data; if (md) { - if (!sg_res_in_use(sfp) && dxfer_len <= rsv_schp->bufflen) + mutex_lock(&sfp->f_mutex); + if (dxfer_len <= rsv_schp->bufflen && + !sfp->res_in_use) { + sfp->res_in_use = 1; sg_link_reserve(sfp, srp, dxfer_len); - else { + } else if ((hp->flags & SG_FLAG_MMAP_IO) && sfp->res_in_use) { + mutex_unlock(&sfp->f_mutex); + return -EBUSY; + } else { res = sg_build_indirect(req_schp, sfp, dxfer_len); - if (res) + if (res) { + mutex_unlock(&sfp->f_mutex); return res; + } } + mutex_unlock(&sfp->f_mutex); md->pages = req_schp->pages; md->page_order = req_schp->page_order; @@ -2145,6 +2163,7 @@ sg_add_sfp(Sg_device * sdp) rwlock_init(&sfp->rq_list_lock); kref_init(&sfp->f_ref); + mutex_init(&sfp->f_mutex); sfp->timeout = SG_DEFAULT_TIMEOUT; sfp->timeout_user = SG_DEFAULT_TIMEOUT_USER; sfp->force_packid = SG_DEF_FORCE_PACK_ID; @@ -2220,20 +2239,6 @@ sg_remove_sfp(struct kref *kref) schedule_work(&sfp->ew.work); } -static int -sg_res_in_use(Sg_fd * sfp) -{ - const Sg_request *srp; - unsigned long iflags; - - read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (srp = sfp->headrp; srp; srp = srp->nextrp) - if (srp->res_used) - break; - read_unlock_irqrestore(&sfp->rq_list_lock, iflags); - return srp ? 1 : 0; -} - #ifdef CONFIG_SCSI_PROC_FS static int sg_idr_max_id(int id, void *p, void *data) From b7571624fe986aecc7f42b421286f90cd7c67d6e Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Mon, 24 Apr 2017 10:26:36 +0200 Subject: [PATCH 089/546] scsi: sg: reset 'res_in_use' after unlinking reserved array commit e791ce27c3f6a1d3c746fd6a8f8e36c9540ec6f9 upstream. Once the reserved page array is unused we can reset the 'res_in_use' state; here we can do a lazy update without holding the mutex as we only need to check against concurrent access, not concurrent release. [mkp: checkpatch] Fixes: 1bc0eb044615 ("scsi: sg: protect accesses to 'reserved' page array") Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Cc: Todd Poynor Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 594ba5874693..8a9e139e2853 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -2052,6 +2052,8 @@ sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp) req_schp->sglist_len = 0; sfp->save_scat_len = 0; srp->res_used = 0; + /* Called without mutex lock to avoid deadlock */ + sfp->res_in_use = 0; } static Sg_request * From c81c4d453edf242af644f7e71f673fc7632f337c Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 31 Aug 2017 08:30:43 +0200 Subject: [PATCH 090/546] drm/i915: fix compiler warning in drivers/gpu/drm/i915/intel_uncore.c MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When building with gcc-7, the following warning happens: drivers/gpu/drm/i915/intel_uncore.c: In function ‘hsw_unclaimed_reg_detect’: drivers/gpu/drm/i915/intel_uncore.c:638:36: warning: decrement of a boolean expression [-Wbool-operation] i915.mmio_debug = mmio_debug_once--; ^~ As it's really not wise to -- on a boolean value. Commit 7571494004d8 ("drm/i915: Do one shot unclaimed mmio detection less frequently") which showed up in 4.6-rc1 does solve this issue, by rewriting the mmio detection logic, but that isn't really good to backport to 4.4-stable, so just fix up the obvious logic here to do the right thing. Cc: Chris Wilson Cc: Paulo Zanoni Cc: Mika Kuoppala Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_uncore.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i915/intel_uncore.c b/drivers/gpu/drm/i915/intel_uncore.c index cc91ae832ffb..6fd7b50c5747 100644 --- a/drivers/gpu/drm/i915/intel_uncore.c +++ b/drivers/gpu/drm/i915/intel_uncore.c @@ -635,7 +635,8 @@ hsw_unclaimed_reg_detect(struct drm_i915_private *dev_priv) "enabling oneshot unclaimed register reporting. " "Please use i915.mmio_debug=N for more information.\n"); __raw_i915_write32(dev_priv, FPGA_DBG, FPGA_DBG_RM_NOCLAIM); - i915.mmio_debug = mmio_debug_once--; + i915.mmio_debug = mmio_debug_once; + mmio_debug_once = false; } } From cd99a4f3f43ba1b88ff0ef01a9d5564fdf824c38 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 2 Sep 2017 07:07:05 +0200 Subject: [PATCH 091/546] Linux 4.4.86 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 0f3d843f42a7..1207bf6a0e7a 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 85 +SUBLEVEL = 86 EXTRAVERSION = NAME = Blurry Fish Butt From 2de71be457d1dfddf356a8d32e78056d46e4e6aa Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 4 Sep 2017 17:47:24 +0800 Subject: [PATCH 092/546] ARM64: add lsk_defconfig Add a arm64 default config named lsk_defconfig, which copied from arch/arm64/configs/defconfig plus TEE, RANDOMIZE_BASE etc config. The hibernate and kexec/kdump config were enabled as default already. Signed-off-by: Alex Shi --- arch/arm64/configs/lsk_defconfig | 235 +++++++++++++++++++++++++++++++ 1 file changed, 235 insertions(+) create mode 100644 arch/arm64/configs/lsk_defconfig diff --git a/arch/arm64/configs/lsk_defconfig b/arch/arm64/configs/lsk_defconfig new file mode 100644 index 000000000000..bf7a37d09656 --- /dev/null +++ b/arch/arm64/configs/lsk_defconfig @@ -0,0 +1,235 @@ +# CONFIG_LOCALVERSION_AUTO is not set +CONFIG_SYSVIPC=y +CONFIG_POSIX_MQUEUE=y +CONFIG_FHANDLE=y +CONFIG_AUDIT=y +CONFIG_NO_HZ_IDLE=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_BSD_PROCESS_ACCT=y +CONFIG_BSD_PROCESS_ACCT_V3=y +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=14 +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_KMEM=y +CONFIG_CGROUP_HUGETLB=y +# CONFIG_UTS_NS is not set +# CONFIG_IPC_NS is not set +# CONFIG_NET_NS is not set +CONFIG_SCHED_AUTOGROUP=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_KALLSYMS_ALL=y +# CONFIG_COMPAT_BRK is not set +CONFIG_PROFILING=y +CONFIG_JUMP_LABEL=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +# CONFIG_BLK_DEV_BSG is not set +# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_ARCH_BCM_IPROC=y +CONFIG_ARCH_BERLIN=y +CONFIG_ARCH_EXYNOS7=y +CONFIG_ARCH_LAYERSCAPE=y +CONFIG_ARCH_HISI=y +CONFIG_ARCH_MEDIATEK=y +CONFIG_ARCH_ROCKCHIP=y +CONFIG_ARCH_SEATTLE=y +CONFIG_ARCH_STRATIX10=y +CONFIG_ARCH_TEGRA=y +CONFIG_ARCH_TEGRA_132_SOC=y +CONFIG_ARCH_QCOM=y +CONFIG_ARCH_SPRD=y +CONFIG_ARCH_THUNDER=y +CONFIG_ARCH_VEXPRESS=y +CONFIG_ARCH_XGENE=y +CONFIG_ARCH_ZYNQMP=y +CONFIG_PCI=y +CONFIG_PCI_MSI=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCI_XGENE=y +CONFIG_SMP=y +CONFIG_SCHED_MC=y +CONFIG_PREEMPT=y +CONFIG_KSM=y +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_CMA=y +CONFIG_KEXEC=y +CONFIG_CRASH_DUMP=y +CONFIG_CMDLINE="console=ttyAMA0" +# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set +CONFIG_COMPAT=y +CONFIG_CPU_IDLE=y +CONFIG_ARM_CPUIDLE=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +# CONFIG_INET_LRO is not set +# CONFIG_IPV6 is not set +CONFIG_BPF_JIT=y +# CONFIG_WIRELESS is not set +CONFIG_NET_9P=y +CONFIG_NET_9P_VIRTIO=y +# CONFIG_TEGRA_AHB is not set +CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_DMA_CMA=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_VIRTIO_BLK=y +# CONFIG_SCSI_PROC_FS is not set +CONFIG_BLK_DEV_SD=y +# CONFIG_SCSI_LOWLEVEL is not set +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_AHCI_CEVA=y +CONFIG_AHCI_XGENE=y +CONFIG_PATA_PLATFORM=y +CONFIG_PATA_OF_PLATFORM=y +CONFIG_NETDEVICES=y +CONFIG_TUN=y +CONFIG_VIRTIO_NET=y +CONFIG_NET_XGENE=y +CONFIG_SKY2=y +CONFIG_SMC91X=y +CONFIG_SMSC911X=y +# CONFIG_WLAN is not set +CONFIG_INPUT_EVDEV=y +CONFIG_KEYBOARD_GPIO=y +# CONFIG_SERIO_SERPORT is not set +CONFIG_SERIO_AMBAKMI=y +CONFIG_LEGACY_PTY_COUNT=16 +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DW=y +CONFIG_SERIAL_8250_MT6577=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_SERIAL_SAMSUNG=y +CONFIG_SERIAL_SAMSUNG_UARTS_4=y +CONFIG_SERIAL_SAMSUNG_UARTS=4 +CONFIG_SERIAL_SAMSUNG_CONSOLE=y +CONFIG_SERIAL_MSM=y +CONFIG_SERIAL_MSM_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_XILINX_PS_UART=y +CONFIG_SERIAL_XILINX_PS_UART_CONSOLE=y +CONFIG_VIRTIO_CONSOLE=y +# CONFIG_HW_RANDOM is not set +CONFIG_I2C=y +CONFIG_I2C_QUP=y +CONFIG_SPI=y +CONFIG_SPI_PL022=y +CONFIG_SPI_QUP=y +CONFIG_PINCTRL_MSM8916=y +CONFIG_GPIO_PL061=y +CONFIG_GPIO_XGENE=y +CONFIG_POWER_RESET_XGENE=y +CONFIG_POWER_RESET_SYSCON=y +# CONFIG_HWMON is not set +CONFIG_REGULATOR=y +CONFIG_REGULATOR_FIXED_VOLTAGE=y +CONFIG_REGULATOR_QCOM_SMD_RPM=y +CONFIG_FB=y +CONFIG_FB_ARMCLCD=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_LOGO=y +# CONFIG_LOGO_LINUX_MONO is not set +# CONFIG_LOGO_LINUX_VGA16 is not set +CONFIG_USB=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_STORAGE=y +CONFIG_USB_ISP1760=y +CONFIG_USB_ULPI=y +CONFIG_MMC=y +CONFIG_MMC_ARMMMCI=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_MMC_SPI=y +CONFIG_MMC_DW=y +CONFIG_MMC_DW_IDMAC=y +CONFIG_MMC_DW_PLTFM=y +CONFIG_MMC_DW_EXYNOS=y +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_SYSCON=y +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=y +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_EFI=y +CONFIG_RTC_DRV_XGENE=y +CONFIG_DMADEVICES=y +CONFIG_QCOM_BAM_DMA=y +CONFIG_VIRTIO_PCI=y +CONFIG_VIRTIO_BALLOON=y +CONFIG_VIRTIO_MMIO=y +CONFIG_COMMON_CLK_QCOM=y +CONFIG_MSM_GCC_8916=y +CONFIG_HWSPINLOCK_QCOM=y +# CONFIG_IOMMU_SUPPORT is not set +CONFIG_QCOM_SMEM=y +CONFIG_QCOM_SMD=y +CONFIG_QCOM_SMD_RPM=y +CONFIG_PHY_XGENE=y +CONFIG_EXT2_FS=y +CONFIG_EXT3_FS=y +# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set +# CONFIG_EXT3_FS_XATTR is not set +CONFIG_EXT4_FS=y +CONFIG_FANOTIFY=y +CONFIG_FANOTIFY_ACCESS_PERMISSIONS=y +CONFIG_QUOTA=y +CONFIG_AUTOFS4_FS=y +CONFIG_FUSE_FS=y +CONFIG_CUSE=y +CONFIG_VFAT_FS=y +CONFIG_TMPFS=y +CONFIG_HUGETLBFS=y +CONFIG_EFIVAR_FS=y +# CONFIG_MISC_FILESYSTEMS is not set +CONFIG_NFS_FS=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y +CONFIG_9P_FS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=y +CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_DEBUG_KERNEL=y +CONFIG_LOCKUP_DETECTOR=y +# CONFIG_SCHED_DEBUG is not set +# CONFIG_DEBUG_PREEMPT is not set +# CONFIG_FTRACE is not set +CONFIG_MEMTEST=y +CONFIG_SECURITY=y +CONFIG_CRYPTO_ANSI_CPRNG=y +CONFIG_ARM64_CRYPTO=y +CONFIG_CRYPTO_SHA1_ARM64_CE=y +CONFIG_CRYPTO_SHA2_ARM64_CE=y +CONFIG_CRYPTO_GHASH_ARM64_CE=y +CONFIG_CRYPTO_AES_ARM64_CE_CCM=y +CONFIG_CRYPTO_AES_ARM64_CE_BLK=y +CONFIG_CRYPTO_AES_ARM64_NEON_BLK=y +CONFIG_CRYPTO_CRC32_ARM64=y +CONFIG_HIBERNATION=y +CONFIG_KPROBES=y +CONFIG_CORESIGHT=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_TEE=y +CONFIG_OPTEE=y From 3fc37084db1bfabcf9c2f8dffdc3255567b329fd Mon Sep 17 00:00:00 2001 From: Alex Shi Date: Mon, 4 Sep 2017 18:17:07 +0800 Subject: [PATCH 093/546] ARM: add lsk_defconfig Add a arm default config named lsk_defconfig, which copied from arch/arm/configs/multi_v7_defconfig plus hibernate, kdump, random_base, coresight config for kernelci testing. Signed-off-by: Alex Shi --- arch/arm/configs/lsk_defconfig | 752 +++++++++++++++++++++++++++++++++ 1 file changed, 752 insertions(+) create mode 100644 arch/arm/configs/lsk_defconfig diff --git a/arch/arm/configs/lsk_defconfig b/arch/arm/configs/lsk_defconfig new file mode 100644 index 000000000000..ec07ed22987b --- /dev/null +++ b/arch/arm/configs/lsk_defconfig @@ -0,0 +1,752 @@ +CONFIG_SYSVIPC=y +CONFIG_FHANDLE=y +CONFIG_IRQ_DOMAIN_DEBUG=y +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +CONFIG_CGROUPS=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_EMBEDDED=y +CONFIG_PERF_EVENTS=y +CONFIG_MODULES=y +CONFIG_MODULE_UNLOAD=y +CONFIG_PARTITION_ADVANCED=y +CONFIG_CMDLINE_PARTITION=y +CONFIG_ARCH_VIRT=y +CONFIG_ARCH_ALPINE=y +CONFIG_ARCH_MVEBU=y +CONFIG_MACH_ARMADA_370=y +CONFIG_MACH_ARMADA_375=y +CONFIG_MACH_ARMADA_38X=y +CONFIG_MACH_ARMADA_39X=y +CONFIG_MACH_ARMADA_XP=y +CONFIG_MACH_DOVE=y +CONFIG_ARCH_AT91=y +CONFIG_SOC_SAMA5D2=y +CONFIG_SOC_SAMA5D3=y +CONFIG_SOC_SAMA5D4=y +CONFIG_ARCH_BCM=y +CONFIG_ARCH_BCM_CYGNUS=y +CONFIG_ARCH_BCM_NSP=y +CONFIG_ARCH_BCM_21664=y +CONFIG_ARCH_BCM_281XX=y +CONFIG_ARCH_BCM_5301X=y +CONFIG_ARCH_BRCMSTB=y +CONFIG_ARCH_BERLIN=y +CONFIG_MACH_BERLIN_BG2=y +CONFIG_MACH_BERLIN_BG2CD=y +CONFIG_MACH_BERLIN_BG2Q=y +CONFIG_ARCH_DIGICOLOR=y +CONFIG_ARCH_HIGHBANK=y +CONFIG_ARCH_HISI=y +CONFIG_ARCH_HI3xxx=y +CONFIG_ARCH_HIX5HD2=y +CONFIG_ARCH_HIP01=y +CONFIG_ARCH_HIP04=y +CONFIG_ARCH_KEYSTONE=y +CONFIG_ARCH_MESON=y +CONFIG_ARCH_MXC=y +CONFIG_SOC_IMX50=y +CONFIG_SOC_IMX51=y +CONFIG_SOC_IMX53=y +CONFIG_SOC_IMX6Q=y +CONFIG_SOC_IMX6SL=y +CONFIG_SOC_IMX6SX=y +CONFIG_SOC_IMX6UL=y +CONFIG_SOC_IMX7D=y +CONFIG_SOC_VF610=y +CONFIG_SOC_LS1021A=y +CONFIG_ARCH_OMAP3=y +CONFIG_ARCH_OMAP4=y +CONFIG_SOC_OMAP5=y +CONFIG_SOC_AM33XX=y +CONFIG_SOC_AM43XX=y +CONFIG_SOC_DRA7XX=y +CONFIG_ARCH_QCOM=y +CONFIG_ARCH_MEDIATEK=y +CONFIG_ARCH_MSM8X60=y +CONFIG_ARCH_MSM8960=y +CONFIG_ARCH_MSM8974=y +CONFIG_ARCH_ROCKCHIP=y +CONFIG_ARCH_SOCFPGA=y +CONFIG_PLAT_SPEAR=y +CONFIG_ARCH_SPEAR13XX=y +CONFIG_MACH_SPEAR1310=y +CONFIG_MACH_SPEAR1340=y +CONFIG_ARCH_STI=y +CONFIG_ARCH_EXYNOS=y +CONFIG_EXYNOS5420_MCPM=y +CONFIG_ARCH_SHMOBILE_MULTI=y +CONFIG_ARCH_EMEV2=y +CONFIG_ARCH_R7S72100=y +CONFIG_ARCH_R8A73A4=y +CONFIG_ARCH_R8A7740=y +CONFIG_ARCH_R8A7778=y +CONFIG_ARCH_R8A7779=y +CONFIG_ARCH_R8A7790=y +CONFIG_ARCH_R8A7791=y +CONFIG_ARCH_R8A7793=y +CONFIG_ARCH_R8A7794=y +CONFIG_ARCH_SH73A0=y +CONFIG_ARCH_SUNXI=y +CONFIG_ARCH_SIRF=y +CONFIG_ARCH_TEGRA=y +CONFIG_ARCH_TEGRA_2x_SOC=y +CONFIG_ARCH_TEGRA_3x_SOC=y +CONFIG_ARCH_TEGRA_114_SOC=y +CONFIG_ARCH_TEGRA_124_SOC=y +CONFIG_TEGRA_EMC_SCALING_ENABLE=y +CONFIG_ARCH_UNIPHIER=y +CONFIG_ARCH_U8500=y +CONFIG_MACH_HREFV60=y +CONFIG_MACH_SNOWBALL=y +CONFIG_MACH_UX500_DT=y +CONFIG_ARCH_VEXPRESS=y +CONFIG_ARCH_VEXPRESS_CA9X4=y +CONFIG_ARCH_VEXPRESS_TC2_PM=y +CONFIG_ARCH_WM8850=y +CONFIG_ARCH_ZYNQ=y +CONFIG_TRUSTED_FOUNDATIONS=y +CONFIG_PCI=y +CONFIG_PCI_HOST_GENERIC=y +CONFIG_PCI_KEYSTONE=y +CONFIG_PCI_MSI=y +CONFIG_PCI_MVEBU=y +CONFIG_PCI_TEGRA=y +CONFIG_PCI_RCAR_GEN2=y +CONFIG_PCI_RCAR_GEN2_PCIE=y +CONFIG_PCIEPORTBUS=y +CONFIG_SMP=y +CONFIG_NR_CPUS=16 +CONFIG_HIGHPTE=y +CONFIG_CMA=y +CONFIG_ARM_APPENDED_DTB=y +CONFIG_ARM_ATAG_DTB_COMPAT=y +CONFIG_KEXEC=y +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_STAT_DETAILS=y +CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND=y +CONFIG_CPU_IDLE=y +CONFIG_ARM_CPUIDLE=y +CONFIG_NEON=y +CONFIG_KERNEL_MODE_NEON=y +CONFIG_ARM_ZYNQ_CPUIDLE=y +CONFIG_ARM_EXYNOS_CPUIDLE=y +CONFIG_NET=y +CONFIG_PACKET=y +CONFIG_UNIX=y +CONFIG_INET=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_CAN=y +CONFIG_CAN_RAW=y +CONFIG_CAN_BCM=y +CONFIG_CAN_DEV=y +CONFIG_CAN_AT91=m +CONFIG_CAN_XILINXCAN=y +CONFIG_CAN_MCP251X=y +CONFIG_CAN_SUN4I=y +CONFIG_BT=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_CFG80211=m +CONFIG_MAC80211=m +CONFIG_RFKILL=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=y +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_DMA_CMA=y +CONFIG_CMA_SIZE_MBYTES=64 +CONFIG_OMAP_OCP2SCP=y +CONFIG_SIMPLE_PM_BUS=y +CONFIG_MTD=y +CONFIG_MTD_CMDLINE_PARTS=y +CONFIG_MTD_BLOCK=y +CONFIG_MTD_M25P80=y +CONFIG_MTD_NAND=y +CONFIG_MTD_NAND_ATMEL=y +CONFIG_MTD_NAND_BRCMNAND=y +CONFIG_MTD_NAND_DAVINCI=y +CONFIG_MTD_SPI_NOR=y +CONFIG_MTD_UBI=y +CONFIG_BLK_DEV_LOOP=y +CONFIG_AD525X_DPOT=y +CONFIG_AD525X_DPOT_I2C=y +CONFIG_ATMEL_TCLIB=y +CONFIG_ICS932S401=y +CONFIG_ATMEL_SSC=m +CONFIG_APDS9802ALS=y +CONFIG_ISL29003=y +CONFIG_EEPROM_AT24=y +CONFIG_EEPROM_SUNXI_SID=y +CONFIG_BLK_DEV_SD=y +CONFIG_BLK_DEV_SR=y +CONFIG_SCSI_MULTI_LUN=y +CONFIG_ATA=y +CONFIG_SATA_AHCI=y +CONFIG_SATA_AHCI_PLATFORM=y +CONFIG_AHCI_ST=y +CONFIG_AHCI_SUNXI=y +CONFIG_AHCI_TEGRA=y +CONFIG_SATA_HIGHBANK=y +CONFIG_SATA_MV=y +CONFIG_SATA_RCAR=y +CONFIG_NETDEVICES=y +CONFIG_HIX5HD2_GMAC=y +CONFIG_SUN4I_EMAC=y +CONFIG_MACB=y +CONFIG_NET_CALXEDA_XGMAC=y +CONFIG_IGB=y +CONFIG_MV643XX_ETH=y +CONFIG_MVNETA=y +CONFIG_PXA168_ETH=m +CONFIG_KS8851=y +CONFIG_R8169=y +CONFIG_SH_ETH=y +CONFIG_SMSC911X=y +CONFIG_STMMAC_ETH=y +CONFIG_TI_CPSW=y +CONFIG_XILINX_EMACLITE=y +CONFIG_AT803X_PHY=y +CONFIG_MARVELL_PHY=y +CONFIG_SMSC_PHY=y +CONFIG_BROADCOM_PHY=y +CONFIG_ICPLUS_PHY=y +CONFIG_MICREL_PHY=y +CONFIG_FIXED_PHY=y +CONFIG_USB_PEGASUS=y +CONFIG_USB_RTL8152=m +CONFIG_USB_USBNET=y +CONFIG_USB_NET_SMSC75XX=y +CONFIG_USB_NET_SMSC95XX=y +CONFIG_BRCMFMAC=m +CONFIG_RT2X00=m +CONFIG_RT2800USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_INPUT_JOYDEV=y +CONFIG_INPUT_EVDEV=y +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_GPIO=y +CONFIG_KEYBOARD_TEGRA=y +CONFIG_KEYBOARD_SPEAR=y +CONFIG_KEYBOARD_ST_KEYSCAN=y +CONFIG_KEYBOARD_CROS_EC=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=y +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_ATMEL_MXT=y +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMPE=y +CONFIG_TOUCHSCREEN_SUN4I=y +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_MPU3050=y +CONFIG_INPUT_AXP20X_PEK=y +CONFIG_INPUT_ADXL34X=m +CONFIG_SERIO_AMBAKMI=y +CONFIG_SERIAL_8250=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DW=y +CONFIG_SERIAL_8250_EM=y +CONFIG_SERIAL_8250_MT6577=y +CONFIG_SERIAL_8250_UNIPHIER=y +CONFIG_SERIAL_AMBA_PL011=y +CONFIG_SERIAL_AMBA_PL011_CONSOLE=y +CONFIG_SERIAL_ATMEL=y +CONFIG_SERIAL_ATMEL_CONSOLE=y +CONFIG_SERIAL_ATMEL_TTYAT=y +CONFIG_SERIAL_MESON=y +CONFIG_SERIAL_MESON_CONSOLE=y +CONFIG_SERIAL_SAMSUNG=y +CONFIG_SERIAL_SAMSUNG_CONSOLE=y +CONFIG_SERIAL_SIRFSOC=y +CONFIG_SERIAL_SIRFSOC_CONSOLE=y +CONFIG_SERIAL_TEGRA=y +CONFIG_SERIAL_IMX=y +CONFIG_SERIAL_IMX_CONSOLE=y +CONFIG_SERIAL_SH_SCI=y +CONFIG_SERIAL_SH_SCI_NR_UARTS=20 +CONFIG_SERIAL_SH_SCI_CONSOLE=y +CONFIG_SERIAL_MSM=y +CONFIG_SERIAL_MSM_CONSOLE=y +CONFIG_SERIAL_VT8500=y +CONFIG_SERIAL_VT8500_CONSOLE=y +CONFIG_SERIAL_OF_PLATFORM=y +CONFIG_SERIAL_OMAP=y +CONFIG_SERIAL_OMAP_CONSOLE=y +CONFIG_SERIAL_XILINX_PS_UART=y +CONFIG_SERIAL_XILINX_PS_UART_CONSOLE=y +CONFIG_SERIAL_FSL_LPUART=y +CONFIG_SERIAL_FSL_LPUART_CONSOLE=y +CONFIG_SERIAL_CONEXANT_DIGICOLOR=y +CONFIG_SERIAL_CONEXANT_DIGICOLOR_CONSOLE=y +CONFIG_SERIAL_ST_ASC=y +CONFIG_SERIAL_ST_ASC_CONSOLE=y +CONFIG_I2C_CHARDEV=y +CONFIG_I2C_DAVINCI=y +CONFIG_I2C_MUX=y +CONFIG_I2C_ARB_GPIO_CHALLENGE=m +CONFIG_I2C_MUX_PCA954x=y +CONFIG_I2C_MUX_PINCTRL=y +CONFIG_I2C_AT91=m +CONFIG_I2C_CADENCE=y +CONFIG_I2C_DESIGNWARE_PLATFORM=y +CONFIG_I2C_DIGICOLOR=m +CONFIG_I2C_GPIO=m +CONFIG_I2C_EXYNOS5=y +CONFIG_I2C_MV64XXX=y +CONFIG_I2C_RIIC=y +CONFIG_I2C_RK3X=y +CONFIG_I2C_S3C2410=y +CONFIG_I2C_SH_MOBILE=y +CONFIG_I2C_SIRF=y +CONFIG_I2C_ST=y +CONFIG_I2C_SUN6I_P2WI=y +CONFIG_I2C_TEGRA=y +CONFIG_I2C_UNIPHIER=y +CONFIG_I2C_UNIPHIER_F=y +CONFIG_I2C_XILINX=y +CONFIG_I2C_RCAR=y +CONFIG_I2C_CROS_EC_TUNNEL=m +CONFIG_SPI=y +CONFIG_SPI_ATMEL=m +CONFIG_SPI_CADENCE=y +CONFIG_SPI_DAVINCI=y +CONFIG_SPI_OMAP24XX=y +CONFIG_SPI_ORION=y +CONFIG_SPI_PL022=y +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_RSPI=y +CONFIG_SPI_S3C64XX=m +CONFIG_SPI_SH_MSIOF=m +CONFIG_SPI_SH_HSPI=y +CONFIG_SPI_SIRF=y +CONFIG_SPI_SUN4I=y +CONFIG_SPI_SUN6I=y +CONFIG_SPI_TEGRA114=y +CONFIG_SPI_TEGRA20_SFLASH=y +CONFIG_SPI_TEGRA20_SLINK=y +CONFIG_SPI_XILINX=y +CONFIG_SPI_SPIDEV=y +CONFIG_PINCTRL_AS3722=y +CONFIG_PINCTRL_PALMAS=y +CONFIG_PINCTRL_APQ8064=y +CONFIG_PINCTRL_APQ8084=y +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC_PLATFORM=y +CONFIG_GPIO_DAVINCI=y +CONFIG_GPIO_DWAPB=y +CONFIG_GPIO_EM=y +CONFIG_GPIO_RCAR=y +CONFIG_GPIO_XILINX=y +CONFIG_GPIO_ZYNQ=y +CONFIG_GPIO_PCA953X=y +CONFIG_GPIO_PCA953X_IRQ=y +CONFIG_GPIO_PCF857X=y +CONFIG_GPIO_TWL4030=y +CONFIG_GPIO_PALMAS=y +CONFIG_GPIO_SYSCON=y +CONFIG_GPIO_TPS6586X=y +CONFIG_GPIO_TPS65910=y +CONFIG_BATTERY_SBS=y +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_TPS65090=y +CONFIG_AXP20X_POWER=m +CONFIG_POWER_RESET_AS3722=y +CONFIG_POWER_RESET_GPIO=y +CONFIG_POWER_RESET_GPIO_RESTART=y +CONFIG_POWER_RESET_KEYSTONE=y +CONFIG_POWER_RESET_RMOBILE=y +CONFIG_SENSORS_LM90=y +CONFIG_SENSORS_LM95245=y +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_THERMAL=y +CONFIG_CPU_THERMAL=y +CONFIG_ROCKCHIP_THERMAL=y +CONFIG_RCAR_THERMAL=y +CONFIG_ARMADA_THERMAL=y +CONFIG_DAVINCI_WATCHDOG=m +CONFIG_EXYNOS_THERMAL=m +CONFIG_ST_THERMAL_SYSCFG=y +CONFIG_ST_THERMAL_MEMMAP=y +CONFIG_WATCHDOG=y +CONFIG_XILINX_WATCHDOG=y +CONFIG_ARM_SP805_WATCHDOG=y +CONFIG_ORION_WATCHDOG=y +CONFIG_ST_LPC_WATCHDOG=y +CONFIG_SUNXI_WATCHDOG=y +CONFIG_TEGRA_WATCHDOG=m +CONFIG_MESON_WATCHDOG=y +CONFIG_DIGICOLOR_WATCHDOG=y +CONFIG_MFD_AS3711=y +CONFIG_MFD_AS3722=y +CONFIG_MFD_ATMEL_FLEXCOM=y +CONFIG_MFD_BCM590XX=y +CONFIG_MFD_AXP20X=y +CONFIG_MFD_CROS_EC=y +CONFIG_MFD_CROS_EC_I2C=m +CONFIG_MFD_CROS_EC_SPI=y +CONFIG_MFD_MAX14577=y +CONFIG_MFD_MAX77686=y +CONFIG_MFD_MAX77693=y +CONFIG_MFD_MAX8907=y +CONFIG_MFD_RK808=y +CONFIG_MFD_PM8921_CORE=y +CONFIG_MFD_QCOM_RPM=y +CONFIG_MFD_SEC_CORE=y +CONFIG_MFD_STMPE=y +CONFIG_MFD_PALMAS=y +CONFIG_MFD_TPS65090=y +CONFIG_MFD_TPS6586X=y +CONFIG_MFD_TPS65910=y +CONFIG_REGULATOR_AB8500=y +CONFIG_REGULATOR_ACT8865=y +CONFIG_REGULATOR_AS3711=y +CONFIG_REGULATOR_AS3722=y +CONFIG_REGULATOR_AXP20X=y +CONFIG_REGULATOR_BCM590XX=y +CONFIG_REGULATOR_DA9210=y +CONFIG_REGULATOR_FAN53555=y +CONFIG_REGULATOR_RK808=y +CONFIG_REGULATOR_GPIO=y +CONFIG_MFD_SYSCON=y +CONFIG_POWER_RESET_SYSCON=y +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX8907=y +CONFIG_REGULATOR_MAX8973=y +CONFIG_REGULATOR_MAX77686=y +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MAX77802=m +CONFIG_REGULATOR_PALMAS=y +CONFIG_REGULATOR_PBIAS=y +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_RPM=y +CONFIG_REGULATOR_QCOM_SMD_RPM=y +CONFIG_REGULATOR_S2MPS11=y +CONFIG_REGULATOR_S5M8767=y +CONFIG_REGULATOR_TPS51632=y +CONFIG_REGULATOR_TPS62360=y +CONFIG_REGULATOR_TPS65090=y +CONFIG_REGULATOR_TPS6586X=y +CONFIG_REGULATOR_TPS65910=y +CONFIG_REGULATOR_TWL4030=y +CONFIG_REGULATOR_VEXPRESS=y +CONFIG_MEDIA_SUPPORT=m +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_CONTROLLER=y +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_MEDIA_USB_SUPPORT=y +CONFIG_USB_VIDEO_CLASS=y +CONFIG_USB_GSPCA=y +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_SOC_CAMERA=m +CONFIG_SOC_CAMERA_PLATFORM=m +CONFIG_VIDEO_RCAR_VIN=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_RENESAS_VSP1=m +# CONFIG_MEDIA_SUBDRV_AUTOSELECT is not set +CONFIG_VIDEO_ADV7180=m +CONFIG_VIDEO_ML86V7667=m +CONFIG_DRM=y +CONFIG_DRM_I2C_ADV7511=m +# CONFIG_DRM_I2C_CH7006 is not set +# CONFIG_DRM_I2C_SIL164 is not set +CONFIG_DRM_NXP_PTN3460=m +CONFIG_DRM_PARADE_PS8622=m +CONFIG_DRM_NOUVEAU=m +CONFIG_DRM_EXYNOS=m +CONFIG_DRM_EXYNOS_DSI=y +CONFIG_DRM_EXYNOS_FIMD=y +CONFIG_DRM_EXYNOS_HDMI=y +CONFIG_DRM_ROCKCHIP=m +CONFIG_ROCKCHIP_DW_HDMI=m +CONFIG_DRM_RCAR_DU=m +CONFIG_DRM_RCAR_HDMI=y +CONFIG_DRM_RCAR_LVDS=y +CONFIG_DRM_TEGRA=y +CONFIG_DRM_PANEL_SAMSUNG_S6E8AA0=m +CONFIG_DRM_PANEL_SIMPLE=y +CONFIG_FB_ARMCLCD=y +CONFIG_FB_WM8505=y +CONFIG_FB_SH_MOBILE_LCDC=y +CONFIG_FB_SIMPLE=y +CONFIG_FB_SH_MOBILE_MERAM=y +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_LCD_PLATFORM=m +CONFIG_BACKLIGHT_PWM=y +CONFIG_BACKLIGHT_AS3711=y +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +CONFIG_SOUND=m +CONFIG_SND=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_HDA_TEGRA=m +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_USB_AUDIO=y +CONFIG_SND_SOC=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_ATMEL_SOC_WM8904=m +CONFIG_SND_SOC_SH4_FSI=m +CONFIG_SND_SOC_RCAR=m +CONFIG_SND_SOC_RSRC_CARD=m +CONFIG_SND_SOC_TEGRA=m +CONFIG_SND_SOC_TEGRA_RT5640=m +CONFIG_SND_SOC_TEGRA_WM8753=m +CONFIG_SND_SOC_TEGRA_WM8903=m +CONFIG_SND_SOC_TEGRA_WM9712=m +CONFIG_SND_SOC_TEGRA_TRIMSLICE=m +CONFIG_SND_SOC_TEGRA_ALC5632=m +CONFIG_SND_SOC_TEGRA_MAX98090=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_WM8978=m +CONFIG_USB=y +CONFIG_USB_XHCI_HCD=y +CONFIG_USB_XHCI_MVEBU=y +CONFIG_USB_EHCI_HCD=y +CONFIG_USB_EHCI_MSM=m +CONFIG_USB_EHCI_EXYNOS=y +CONFIG_USB_EHCI_TEGRA=y +CONFIG_USB_EHCI_HCD_STI=y +CONFIG_USB_EHCI_HCD_PLATFORM=y +CONFIG_USB_ISP1760=y +CONFIG_USB_OHCI_HCD=y +CONFIG_USB_OHCI_HCD_STI=y +CONFIG_USB_OHCI_HCD_PLATFORM=y +CONFIG_USB_OHCI_EXYNOS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_RENESAS_USBHS=m +CONFIG_USB_STORAGE=y +CONFIG_USB_DWC3=y +CONFIG_USB_DWC2=m +CONFIG_USB_CHIPIDEA=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_AB8500_USB=y +CONFIG_KEYSTONE_USB_PHY=y +CONFIG_OMAP_USB3=y +CONFIG_USB_GPIO_VBUS=y +CONFIG_USB_ISP1301=y +CONFIG_USB_MSM_OTG=m +CONFIG_USB_MXS_PHY=y +CONFIG_USB_RCAR_PHY=m +CONFIG_USB_GADGET=y +CONFIG_USB_RENESAS_USBHS_UDC=m +CONFIG_USB_ETH=m +CONFIG_MMC=y +CONFIG_MMC_BLOCK_MINORS=16 +CONFIG_MMC_ARMMMCI=y +CONFIG_MMC_SDHCI=y +CONFIG_MMC_SDHCI_PLTFM=y +CONFIG_MMC_SDHCI_OF_ARASAN=y +CONFIG_MMC_SDHCI_OF_AT91=y +CONFIG_MMC_SDHCI_ESDHC_IMX=y +CONFIG_MMC_SDHCI_DOVE=y +CONFIG_MMC_SDHCI_TEGRA=y +CONFIG_MMC_SDHCI_PXAV3=y +CONFIG_MMC_SDHCI_SPEAR=y +CONFIG_MMC_SDHCI_S3C=y +CONFIG_MMC_SDHCI_S3C_DMA=y +CONFIG_MMC_SDHCI_BCM_KONA=y +CONFIG_MMC_SDHCI_ST=y +CONFIG_MMC_OMAP=y +CONFIG_MMC_OMAP_HS=y +CONFIG_MMC_ATMELMCI=y +CONFIG_MMC_MVSDIO=y +CONFIG_MMC_SDHI=y +CONFIG_MMC_DW=y +CONFIG_MMC_DW_IDMAC=y +CONFIG_MMC_DW_PLTFM=y +CONFIG_MMC_DW_EXYNOS=y +CONFIG_MMC_DW_ROCKCHIP=y +CONFIG_MMC_SH_MMCIF=y +CONFIG_MMC_SUNXI=y +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_GPIO=y +CONFIG_LEDS_PWM=y +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=y +CONFIG_LEDS_TRIGGER_ONESHOT=y +CONFIG_LEDS_TRIGGER_HEARTBEAT=y +CONFIG_LEDS_TRIGGER_BACKLIGHT=y +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_GPIO=y +CONFIG_LEDS_TRIGGER_DEFAULT_ON=y +CONFIG_LEDS_TRIGGER_TRANSIENT=y +CONFIG_LEDS_TRIGGER_CAMERA=y +CONFIG_EDAC=y +CONFIG_EDAC_MM_EDAC=y +CONFIG_EDAC_HIGHBANK_MC=y +CONFIG_EDAC_HIGHBANK_L2=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_DRV_AS3722=y +CONFIG_RTC_DRV_DS1307=y +CONFIG_RTC_DRV_HYM8563=m +CONFIG_RTC_DRV_MAX8907=y +CONFIG_RTC_DRV_MAX77686=y +CONFIG_RTC_DRV_RK808=m +CONFIG_RTC_DRV_MAX77802=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_PALMAS=y +CONFIG_RTC_DRV_ST_LPC=y +CONFIG_RTC_DRV_TWL4030=y +CONFIG_RTC_DRV_TPS6586X=y +CONFIG_RTC_DRV_TPS65910=y +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_EM3027=y +CONFIG_RTC_DRV_DIGICOLOR=m +CONFIG_RTC_DRV_S5M=m +CONFIG_RTC_DRV_S3C=m +CONFIG_RTC_DRV_PL031=y +CONFIG_RTC_DRV_AT91RM9200=m +CONFIG_RTC_DRV_AT91SAM9=m +CONFIG_RTC_DRV_VT8500=y +CONFIG_RTC_DRV_SUN6I=y +CONFIG_RTC_DRV_SUNXI=y +CONFIG_RTC_DRV_MV=y +CONFIG_RTC_DRV_TEGRA=y +CONFIG_DMADEVICES=y +CONFIG_DW_DMAC=y +CONFIG_AT_HDMAC=y +CONFIG_AT_XDMAC=y +CONFIG_MV_XOR=y +CONFIG_TEGRA20_APB_DMA=y +CONFIG_SH_DMAE=y +CONFIG_RCAR_DMAC=y +CONFIG_STE_DMA40=y +CONFIG_SIRF_DMA=y +CONFIG_TI_EDMA=y +CONFIG_PL330_DMA=y +CONFIG_IMX_SDMA=y +CONFIG_IMX_DMA=y +CONFIG_MXS_DMA=y +CONFIG_DMA_OMAP=y +CONFIG_QCOM_BAM_DMA=y +CONFIG_XILINX_VDMA=y +CONFIG_DMA_SUN6I=y +CONFIG_STAGING=y +CONFIG_SENSORS_ISL29018=y +CONFIG_SENSORS_ISL29028=y +CONFIG_MFD_NVEC=y +CONFIG_KEYBOARD_NVEC=y +CONFIG_SERIO_NVEC_PS2=y +CONFIG_NVEC_POWER=y +CONFIG_NVEC_PAZ00=y +CONFIG_QCOM_GSBI=y +CONFIG_QCOM_PM=y +CONFIG_QCOM_SMD=y +CONFIG_QCOM_SMD_RPM=y +CONFIG_QCOM_SMEM=y +CONFIG_COMMON_CLK_QCOM=y +CONFIG_CHROME_PLATFORMS=y +CONFIG_CROS_EC_CHARDEV=m +CONFIG_COMMON_CLK_MAX77686=y +CONFIG_COMMON_CLK_MAX77802=m +CONFIG_COMMON_CLK_S2MPS11=m +CONFIG_APQ_MMCC_8084=y +CONFIG_MSM_GCC_8660=y +CONFIG_MSM_MMCC_8960=y +CONFIG_MSM_MMCC_8974=y +CONFIG_HWSPINLOCK_QCOM=y +CONFIG_ROCKCHIP_IOMMU=y +CONFIG_TEGRA_IOMMU_GART=y +CONFIG_TEGRA_IOMMU_SMMU=y +CONFIG_PM_DEVFREQ=y +CONFIG_ARM_TEGRA_DEVFREQ=m +CONFIG_MEMORY=y +CONFIG_EXTCON=y +CONFIG_TI_AEMIF=y +CONFIG_IIO=y +CONFIG_AT91_ADC=m +CONFIG_BERLIN2_ADC=m +CONFIG_EXYNOS_ADC=m +CONFIG_XILINX_XADC=y +CONFIG_AK8975=y +CONFIG_PWM=y +CONFIG_PWM_ATMEL=m +CONFIG_PWM_ATMEL_TCB=m +CONFIG_PWM_RENESAS_TPU=y +CONFIG_PWM_ROCKCHIP=m +CONFIG_PWM_SAMSUNG=m +CONFIG_PWM_SUN4I=y +CONFIG_PWM_TEGRA=y +CONFIG_PWM_VT8500=y +CONFIG_PHY_HIX5HD2_SATA=y +CONFIG_PWM_STI=m +CONFIG_OMAP_USB2=y +CONFIG_TI_PIPE3=y +CONFIG_PHY_BERLIN_USB=y +CONFIG_PHY_BERLIN_SATA=y +CONFIG_PHY_ROCKCHIP_USB=m +CONFIG_PHY_QCOM_APQ8064_SATA=m +CONFIG_PHY_MIPHY28LP=y +CONFIG_PHY_MIPHY365X=y +CONFIG_PHY_RCAR_GEN2=m +CONFIG_PHY_STIH41X_USB=y +CONFIG_PHY_STIH407_USB=y +CONFIG_PHY_SUN4I_USB=y +CONFIG_PHY_SUN9I_USB=y +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_EXT4_FS=y +CONFIG_AUTOFS4_FS=y +CONFIG_MSDOS_FS=y +CONFIG_VFAT_FS=y +CONFIG_NTFS_FS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_UBIFS_FS=y +CONFIG_TMPFS=y +CONFIG_SQUASHFS=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_NFS_FS=y +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=y +CONFIG_ROOT_NFS=y +CONFIG_NLS_CODEPAGE_437=y +CONFIG_NLS_ISO8859_1=y +CONFIG_NLS_UTF8=y +CONFIG_PRINTK_TIME=y +CONFIG_DEBUG_FS=y +CONFIG_MAGIC_SYSRQ=y +CONFIG_LOCKUP_DETECTOR=y +CONFIG_CRYPTO_DEV_TEGRA_AES=y +CONFIG_CPUFREQ_DT=y +CONFIG_KEYSTONE_IRQ=y +CONFIG_CRYPTO_DEV_SUN4I_SS=m +CONFIG_ARM_CRYPTO=y +CONFIG_CRYPTO_SHA1_ARM=m +CONFIG_CRYPTO_SHA1_ARM_NEON=m +CONFIG_CRYPTO_SHA1_ARM_CE=m +CONFIG_CRYPTO_SHA2_ARM_CE=m +CONFIG_CRYPTO_SHA256_ARM=m +CONFIG_CRYPTO_SHA512_ARM=m +CONFIG_CRYPTO_AES_ARM=m +CONFIG_CRYPTO_AES_ARM_BS=m +CONFIG_CRYPTO_AES_ARM_CE=m +CONFIG_CRYPTO_GHASH_ARM_CE=m +CONFIG_CRYPTO_DEV_ATMEL_AES=m +CONFIG_CRYPTO_DEV_ATMEL_TDES=m +CONFIG_CRYPTO_DEV_ATMEL_SHA=m +CONFIG_HIBERNATION=y +CONFIG_KPROBES=y +CONFIG_CORESIGHT=y +CONFIG_RANDOMIZE_BASE=y From e119fc492de91bf13535a61b67a199f92c391cf5 Mon Sep 17 00:00:00 2001 From: James Hogan Date: Sat, 12 Aug 2017 21:36:09 -0700 Subject: [PATCH 094/546] irqchip: mips-gic: SYNC after enabling GIC region commit 2c0e8382386f618c85d20cb05e7cf7df8cdd382c upstream. A SYNC is required between enabling the GIC region and actually trying to use it, even if the first access is a read, otherwise its possible depending on the timing (and in my case depending on the precise alignment of certain kernel code) to hit CM bus errors on that first access. Add the SYNC straight after setting the GIC base. [paul.burton@imgtec.com: Changes later in this series increase our likelihood of hitting this by reducing the amount of code that runs between enabling the GIC & accessing it.] Fixes: a7057270c280 ("irqchip: mips-gic: Add device-tree support") Signed-off-by: James Hogan Signed-off-by: Paul Burton Acked-by: Marc Zyngier Cc: Thomas Gleixner Cc: Jason Cooper Cc: James Hogan Cc: linux-kernel@vger.kernel.org Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17019/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- drivers/irqchip/irq-mips-gic.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/irqchip/irq-mips-gic.c b/drivers/irqchip/irq-mips-gic.c index 9e17ef27a183..6f1dbd52ec91 100644 --- a/drivers/irqchip/irq-mips-gic.c +++ b/drivers/irqchip/irq-mips-gic.c @@ -915,8 +915,11 @@ static int __init gic_of_init(struct device_node *node, gic_len = resource_size(&res); } - if (mips_cm_present()) + if (mips_cm_present()) { write_gcr_gic_base(gic_base | CM_GCR_GIC_BASE_GICEN_MSK); + /* Ensure GIC region is enabled before trying to access it */ + __sync(); + } gic_present = true; __gic_init(gic_base, gic_len, cpu_vec, 0, node); From fab3229af4e589d6a06201e14e2f3bfcc95ad9af Mon Sep 17 00:00:00 2001 From: Stephen Douthit Date: Mon, 7 Aug 2017 17:10:59 -0400 Subject: [PATCH 095/546] i2c: ismt: Don't duplicate the receive length for block reads commit b6c159a9cb69c2cf0bf59d4e12c3a2da77e4d994 upstream. According to Table 15-14 of the C2000 EDS (Intel doc #510524) the rx data pointed to by the descriptor dptr contains the byte count. desc->rxbytes reports all bytes read on the wire, including the "byte count" byte. So if a device sends 4 bytes in response to a block read, on the wire and in the DMA buffer we see: count data1 data2 data3 data4 0x04 0xde 0xad 0xbe 0xef That's what we want to return in data->block to the next level. Instead we were actually prefixing that with desc->rxbytes: bad count count data1 data2 data3 data4 0x05 0x04 0xde 0xad 0xbe 0xef This was discovered while developing a BMC solution relying on the ipmi_ssif.c driver which was trying to interpret the bogus length field as part of the IPMI response. Signed-off-by: Stephen Douthit Tested-by: Dan Priamo Acked-by: Neil Horman Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-ismt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index 7ba795b24e75..f22b0e5778dd 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -339,8 +339,8 @@ static int ismt_process_desc(const struct ismt_desc *desc, break; case I2C_SMBUS_BLOCK_DATA: case I2C_SMBUS_I2C_BLOCK_DATA: - memcpy(&data->block[1], dma_buffer, desc->rxbytes); - data->block[0] = desc->rxbytes; + memcpy(data->block, dma_buffer, desc->rxbytes); + data->block[0] = desc->rxbytes - 1; break; } return 0; From 043ccc9781cc2e2914253c1c6f17923905c83a4e Mon Sep 17 00:00:00 2001 From: Stephen Douthit Date: Mon, 7 Aug 2017 17:11:00 -0400 Subject: [PATCH 096/546] i2c: ismt: Return EMSGSIZE for block reads with bogus length commit ba201c4f5ebe13d7819081756378777d8153f23e upstream. Compare the number of bytes actually seen on the wire to the byte count field returned by the slave device. Previously we just overwrote the byte count returned by the slave with the real byte count and let the caller figure out if the message was sane. Signed-off-by: Stephen Douthit Tested-by: Dan Priamo Acked-by: Neil Horman Signed-off-by: Wolfram Sang Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-ismt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-ismt.c b/drivers/i2c/busses/i2c-ismt.c index f22b0e5778dd..639d1a9c8793 100644 --- a/drivers/i2c/busses/i2c-ismt.c +++ b/drivers/i2c/busses/i2c-ismt.c @@ -339,8 +339,10 @@ static int ismt_process_desc(const struct ismt_desc *desc, break; case I2C_SMBUS_BLOCK_DATA: case I2C_SMBUS_I2C_BLOCK_DATA: + if (desc->rxbytes != dma_buffer[0] + 1) + return -EMSGSIZE; + memcpy(data->block, dma_buffer, desc->rxbytes); - data->block[0] = desc->rxbytes - 1; break; } return 0; From 857d0b3dd7566ebb70686a009c7100322ddf3bbe Mon Sep 17 00:00:00 2001 From: "Yan, Zheng" Date: Fri, 4 Aug 2017 11:22:31 +0800 Subject: [PATCH 097/546] ceph: fix readpage from fscache commit dd2bc473482eedc60c29cf00ad12568ce40ce511 upstream. ceph_readpage() unlocks page prematurely prematurely in the case that page is reading from fscache. Caller of readpage expects that page is uptodate when it get unlocked. So page shoule get locked by completion callback of fscache_read_or_alloc_pages() Signed-off-by: "Yan, Zheng" Reviewed-by: Jeff Layton Signed-off-by: Ilya Dryomov Signed-off-by: Greg Kroah-Hartman --- fs/ceph/addr.c | 24 +++++++++++++++--------- fs/ceph/cache.c | 12 +++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index c6a1ec110c01..22bae2b434e2 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -189,7 +189,7 @@ static int ceph_releasepage(struct page *page, gfp_t g) /* * read a single page, without unlocking it. */ -static int readpage_nounlock(struct file *filp, struct page *page) +static int ceph_do_readpage(struct file *filp, struct page *page) { struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); @@ -219,7 +219,7 @@ static int readpage_nounlock(struct file *filp, struct page *page) err = ceph_readpage_from_fscache(inode, page); if (err == 0) - goto out; + return -EINPROGRESS; dout("readpage inode %p file %p page %p index %lu\n", inode, filp, page, page->index); @@ -249,8 +249,11 @@ static int readpage_nounlock(struct file *filp, struct page *page) static int ceph_readpage(struct file *filp, struct page *page) { - int r = readpage_nounlock(filp, page); - unlock_page(page); + int r = ceph_do_readpage(filp, page); + if (r != -EINPROGRESS) + unlock_page(page); + else + r = 0; return r; } @@ -1094,7 +1097,7 @@ static int ceph_update_writeable_page(struct file *file, goto retry_locked; r = writepage_nounlock(page, NULL); if (r < 0) - goto fail_nosnap; + goto fail_unlock; goto retry_locked; } @@ -1122,11 +1125,14 @@ static int ceph_update_writeable_page(struct file *file, } /* we need to read it. */ - r = readpage_nounlock(file, page); - if (r < 0) - goto fail_nosnap; + r = ceph_do_readpage(file, page); + if (r < 0) { + if (r == -EINPROGRESS) + return -EAGAIN; + goto fail_unlock; + } goto retry_locked; -fail_nosnap: +fail_unlock: unlock_page(page); return r; } diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1ba7..ff1cfd7b1083 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -224,13 +224,7 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) fscache_relinquish_cookie(cookie, 0); } -static void ceph_vfs_readpage_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); -} - -static void ceph_vfs_readpage_complete_unlock(struct page *page, void *data, int error) +static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) { if (!error) SetPageUptodate(page); @@ -259,7 +253,7 @@ int ceph_readpage_from_fscache(struct inode *inode, struct page *page) return -ENOBUFS; ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_vfs_readpage_complete, NULL, + ceph_readpage_from_fscache_complete, NULL, GFP_KERNEL); switch (ret) { @@ -288,7 +282,7 @@ int ceph_readpages_from_fscache(struct inode *inode, return -ENOBUFS; ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_vfs_readpage_complete_unlock, + ceph_readpage_from_fscache_complete, NULL, mapping_gfp_mask(mapping)); switch (ret) { From 15e94ec4ec2155cae1e0de64b5ea16277c21daf8 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Mon, 28 Aug 2017 14:51:27 -0700 Subject: [PATCH 098/546] cpumask: fix spurious cpumask_of_node() on non-NUMA multi-node configs commit b339752d054fb32863418452dff350a1086885b1 upstream. When !NUMA, cpumask_of_node(@node) equals cpu_online_mask regardless of @node. The assumption seems that if !NUMA, there shouldn't be more than one node and thus reporting cpu_online_mask regardless of @node is correct. However, that assumption was broken years ago to support DISCONTIGMEM and whether a system has multiple nodes or not is separately controlled by NEED_MULTIPLE_NODES. This means that, on a system with !NUMA && NEED_MULTIPLE_NODES, cpumask_of_node() will report cpu_online_mask for all possible nodes, indicating that the CPUs are associated with multiple nodes which is an impossible configuration. This bug has been around forever but doesn't look like it has caused any noticeable symptoms. However, it triggers a WARN recently added to workqueue to verify NUMA affinity configuration. Fix it by reporting empty cpumask on non-zero nodes if !NUMA. Signed-off-by: Tejun Heo Reported-and-tested-by: Geert Uytterhoeven Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- include/asm-generic/topology.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/include/asm-generic/topology.h b/include/asm-generic/topology.h index fc824e2828f3..5d2add1a6c96 100644 --- a/include/asm-generic/topology.h +++ b/include/asm-generic/topology.h @@ -48,7 +48,11 @@ #define parent_node(node) ((void)(node),0) #endif #ifndef cpumask_of_node -#define cpumask_of_node(node) ((void)node, cpu_online_mask) + #ifdef CONFIG_NEED_MULTIPLE_NODES + #define cpumask_of_node(node) ((node) == 0 ? cpu_online_mask : cpu_none_mask) + #else + #define cpumask_of_node(node) ((void)node, cpu_online_mask) + #endif #endif #ifndef pcibus_to_node #define pcibus_to_node(bus) ((void)(bus), -1) From ed48d9230e303aaff037d60fd866088c114184e5 Mon Sep 17 00:00:00 2001 From: Waiman Long Date: Thu, 24 Aug 2017 12:04:29 -0400 Subject: [PATCH 099/546] cpuset: Fix incorrect memory_pressure control file mapping commit 1c08c22c874ac88799cab1f78c40f46110274915 upstream. The memory_pressure control file was incorrectly set up without a private value (0, by default). As a result, this control file was treated like memory_migrate on read. By adding back the FILE_MEMORY_PRESSURE private value, the correct memory pressure value will be returned. Signed-off-by: Waiman Long Signed-off-by: Tejun Heo Fixes: 7dbdb199d3bf ("cgroup: replace cftype->mode with CFTYPE_WORLD_WRITABLE") Signed-off-by: Greg Kroah-Hartman --- kernel/cpuset.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 8ccd66a97c8b..2924b6faa469 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -1910,6 +1910,7 @@ static struct cftype files[] = { { .name = "memory_pressure", .read_u64 = cpuset_read_u64, + .private = FILE_MEMORY_PRESSURE, }, { From 6cdda3497db80b8a218da3748b78012a22c397d0 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Thu, 1 Oct 2015 01:35:55 +0100 Subject: [PATCH 100/546] alpha: uapi: Add support for __SANE_USERSPACE_TYPES__ commit cec80d82142ab25c71eee24b529cfeaf17c43062 upstream. This fixes compiler errors in perf such as: tests/attr.c: In function 'store_event': tests/attr.c:66:27: error: format '%llu' expects argument of type 'long long unsigned int', but argument 6 has type '__u64 {aka long unsigned int}' [-Werror=format=] snprintf(path, PATH_MAX, "%s/event-%d-%llu-%d", dir, ^ Signed-off-by: Ben Hutchings Tested-by: Michael Cree Signed-off-by: Matt Turner Signed-off-by: Greg Kroah-Hartman --- arch/alpha/include/asm/types.h | 2 +- arch/alpha/include/uapi/asm/types.h | 12 +++++++++++- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/arch/alpha/include/asm/types.h b/arch/alpha/include/asm/types.h index 4cb4b6d3452c..0bc66e1d3a7e 100644 --- a/arch/alpha/include/asm/types.h +++ b/arch/alpha/include/asm/types.h @@ -1,6 +1,6 @@ #ifndef _ALPHA_TYPES_H #define _ALPHA_TYPES_H -#include +#include #endif /* _ALPHA_TYPES_H */ diff --git a/arch/alpha/include/uapi/asm/types.h b/arch/alpha/include/uapi/asm/types.h index 9fd3cd459777..8d1024d7be05 100644 --- a/arch/alpha/include/uapi/asm/types.h +++ b/arch/alpha/include/uapi/asm/types.h @@ -9,8 +9,18 @@ * need to be careful to avoid a name clashes. */ -#ifndef __KERNEL__ +/* + * This is here because we used to use l64 for alpha + * and we don't want to impact user mode with our change to ll64 + * in the kernel. + * + * However, some user programs are fine with this. They can + * flag __SANE_USERSPACE_TYPES__ to get int-ll64.h here. + */ +#if !defined(__SANE_USERSPACE_TYPES__) && !defined(__KERNEL__) #include +#else +#include #endif #endif /* _UAPI_ALPHA_TYPES_H */ From e596cc1454d4ca3ab1e233b9a89d3e44d7062f4e Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Thu, 24 Aug 2017 15:16:40 -0700 Subject: [PATCH 101/546] CIFS: Fix maximum SMB2 header size commit 9e37b1784f2be9397a903307574ee565bbadfd75 upstream. Currently the maximum size of SMB2/3 header is set incorrectly which leads to hanging of directory listing operations on encrypted SMB3 connections. Fix this by setting the maximum size to 170 bytes that is calculated as RFC1002 length field size (4) + transform header size (52) + SMB2 header size (64) + create response size (56). Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Acked-by: Sachin Prabhu Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index b8f553b32dda..aacb15bd56fe 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -82,8 +82,8 @@ #define NUMBER_OF_SMB2_COMMANDS 0x0013 -/* BB FIXME - analyze following length BB */ -#define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ +/* 4 len + 52 transform hdr + 64 hdr + 56 create rsp */ +#define MAX_SMB2_HDR_SIZE 0x00b0 #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) From 5f08f0aebf7084ad1c6359c22adb104fa006889c Mon Sep 17 00:00:00 2001 From: Steve French Date: Sun, 27 Aug 2017 16:56:08 -0500 Subject: [PATCH 102/546] CIFS: remove endian related sparse warning commit 6e3c1529c39e92ed64ca41d53abadabbaa1d5393 upstream. Recent patch had an endian warning ie cifs: return ENAMETOOLONG for overlong names in cifs_open()/cifs_lookup() Signed-off-by: Steve French CC: Ronnie Sahlberg Acked-by: Pavel Shilovsky Signed-off-by: Greg Kroah-Hartman --- fs/cifs/dir.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index fa8df3fef6fc..297e05c9e2b0 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -194,7 +194,7 @@ check_name(struct dentry *direntry, struct cifs_tcon *tcon) int i; if (unlikely(direntry->d_name.len > - tcon->fsAttrInfo.MaxPathNameComponentLength)) + le32_to_cpu(tcon->fsAttrInfo.MaxPathNameComponentLength))) return -ENAMETOOLONG; if (!(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS)) { From c0c2e7567a34ca48c8a2d1c89d8a7a5ceb647e08 Mon Sep 17 00:00:00 2001 From: Cong Wang Date: Thu, 31 Aug 2017 16:47:43 +0200 Subject: [PATCH 103/546] wl1251: add a missing spin_lock_init() commit f581a0dd744fe32b0a8805e279c59ec1ac676d60 upstream. wl1251: add a missing spin_lock_init() This fixes the following kernel warning: [ 5668.771453] BUG: spinlock bad magic on CPU#0, kworker/u2:3/9745 [ 5668.771850] lock: 0xce63ef20, .magic: 00000000, .owner: /-1, .owner_cpu: 0 [ 5668.772277] CPU: 0 PID: 9745 Comm: kworker/u2:3 Tainted: G W 4.12.0-03002-gec979a4-dirty #40 [ 5668.772796] Hardware name: Nokia RX-51 board [ 5668.773071] Workqueue: phy1 wl1251_irq_work [ 5668.773345] [] (unwind_backtrace) from [] (show_stack+0x10/0x14) [ 5668.773803] [] (show_stack) from [] (do_raw_spin_lock+0x6c/0xa0) [ 5668.774230] [] (do_raw_spin_lock) from [] (_raw_spin_lock_irqsave+0x10/0x18) [ 5668.774658] [] (_raw_spin_lock_irqsave) from [] (wl1251_op_tx+0x38/0x5c) [ 5668.775115] [] (wl1251_op_tx) from [] (ieee80211_tx_frags+0x188/0x1c0) [ 5668.775543] [] (ieee80211_tx_frags) from [] (__ieee80211_tx+0x6c/0x130) [ 5668.775970] [] (__ieee80211_tx) from [] (ieee80211_tx+0xdc/0x104) [ 5668.776367] [] (ieee80211_tx) from [] (__ieee80211_subif_start_xmit+0x454/0x8c8) [ 5668.776824] [] (__ieee80211_subif_start_xmit) from [] (ieee80211_subif_start_xmit+0x30/0x2fc) [ 5668.777343] [] (ieee80211_subif_start_xmit) from [] (dev_hard_start_xmit+0x80/0x118) ... by adding the missing spin_lock_init(). Reported-by: Pavel Machek Cc: Kalle Valo Signed-off-by: Cong Wang Acked-by: Pavel Machek Signed-off-by: Kalle Valo Signed-off-by: Pavel Machek Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ti/wl1251/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/net/wireless/ti/wl1251/main.c b/drivers/net/wireless/ti/wl1251/main.c index cd4777954f87..9bee3f11898a 100644 --- a/drivers/net/wireless/ti/wl1251/main.c +++ b/drivers/net/wireless/ti/wl1251/main.c @@ -1567,6 +1567,7 @@ struct ieee80211_hw *wl1251_alloc_hw(void) wl->state = WL1251_STATE_OFF; mutex_init(&wl->mutex); + spin_lock_init(&wl->wl_lock); wl->tx_mgmt_frm_rate = DEFAULT_HW_GEN_TX_RATE; wl->tx_mgmt_frm_mod = DEFAULT_HW_GEN_MODULATION_TYPE; From 9b3dcc98d8df16913d260c8dae64ad6e5bfcb953 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 2 Aug 2017 19:50:14 +0200 Subject: [PATCH 104/546] xfrm: policy: check policy direction value commit 7bab09631c2a303f87a7eb7e3d69e888673b9b7e upstream. The 'dir' parameter in xfrm_migrate() is a user-controlled byte which is used as an array index. This can lead to an out-of-bound access, kernel lockup and DoS. Add a check for the 'dir' value. This fixes CVE-2017-11600. References: https://bugzilla.redhat.com/show_bug.cgi?id=1474928 Fixes: 80c9abaabf42 ("[XFRM]: Extension for dynamic update of endpoint address(es)") Reported-by: "bo Zhang" Signed-off-by: Vladis Dronov Signed-off-by: Steffen Klassert Signed-off-by: Greg Kroah-Hartman --- net/xfrm/xfrm_policy.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 77055a362041..0e01250f2072 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -3275,9 +3275,15 @@ int xfrm_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, struct xfrm_state *x_new[XFRM_MAX_DEPTH]; struct xfrm_migrate *mp; + /* Stage 0 - sanity checks */ if ((err = xfrm_migrate_check(m, num_migrate)) < 0) goto out; + if (dir >= XFRM_POLICY_MAX) { + err = -EINVAL; + goto out; + } + /* Stage 1 - find policy */ if ((pol = xfrm_migrate_policy_find(sel, dir, type, net)) == NULL) { err = -ENOENT; From 94183009ac0e58972c2b4bf1bf13b61b1a23971b Mon Sep 17 00:00:00 2001 From: "Xiangliang.Yu" Date: Wed, 16 Aug 2017 14:25:51 +0800 Subject: [PATCH 105/546] drm/ttm: Fix accounting error when fail to get pages for pool MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 9afae2719273fa1d406829bf3498f82dbdba71c7 upstream. When fail to get needed page for pool, need to put allocated pages into pool. But current code has a miscalculation of allocated pages, correct it. Signed-off-by: Xiangliang.Yu Reviewed-by: Christian König Reviewed-by: Monk Liu Signed-off-by: Alex Deucher Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/ttm/ttm_page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/gpu/drm/ttm/ttm_page_alloc.c b/drivers/gpu/drm/ttm/ttm_page_alloc.c index 025c429050c0..5d8dfe027b30 100644 --- a/drivers/gpu/drm/ttm/ttm_page_alloc.c +++ b/drivers/gpu/drm/ttm/ttm_page_alloc.c @@ -612,7 +612,7 @@ static void ttm_page_pool_fill_locked(struct ttm_page_pool *pool, } else { pr_err("Failed to fill pool (%p)\n", pool); /* If we have any pages left put them to the pool. */ - list_for_each_entry(p, &pool->list, lru) { + list_for_each_entry(p, &new_pages, lru) { ++cpages; } list_splice(&new_pages, &pool->list); From 57ff696f54b5c51d8d4df00295341bec17fab36f Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Wed, 3 May 2017 15:17:51 +0100 Subject: [PATCH 106/546] kvm: arm/arm64: Fix race in resetting stage2 PGD MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 6c0d706b563af732adb094c5bf807437e8963e84 upstream. In kvm_free_stage2_pgd() we check the stage2 PGD before holding the lock and proceed to take the lock if it is valid. And we unmap the page tables, followed by releasing the lock. We reset the PGD only after dropping this lock, which could cause a race condition where another thread waiting on or even holding the lock, could potentially see that the PGD is still valid and proceed to perform a stage2 operation and later encounter a NULL PGD. [223090.242280] Unable to handle kernel NULL pointer dereference at virtual address 00000040 [223090.262330] PC is at unmap_stage2_range+0x8c/0x428 [223090.262332] LR is at kvm_unmap_hva_handler+0x2c/0x3c [223090.262531] Call trace: [223090.262533] [] unmap_stage2_range+0x8c/0x428 [223090.262535] [] kvm_unmap_hva_handler+0x2c/0x3c [223090.262537] [] handle_hva_to_gpa+0xb0/0x104 [223090.262539] [] kvm_unmap_hva+0x5c/0xbc [223090.262543] [] kvm_mmu_notifier_invalidate_page+0x50/0x8c [223090.262547] [] __mmu_notifier_invalidate_page+0x5c/0x84 [223090.262551] [] try_to_unmap_one+0x1d0/0x4a0 [223090.262553] [] rmap_walk+0x1cc/0x2e0 [223090.262555] [] try_to_unmap+0x74/0xa4 [223090.262557] [] migrate_pages+0x31c/0x5ac [223090.262561] [] compact_zone+0x3fc/0x7ac [223090.262563] [] compact_zone_order+0x94/0xb0 [223090.262564] [] try_to_compact_pages+0x108/0x290 [223090.262569] [] __alloc_pages_direct_compact+0x70/0x1ac [223090.262571] [] __alloc_pages_nodemask+0x434/0x9f4 [223090.262572] [] alloc_pages_vma+0x230/0x254 [223090.262574] [] do_huge_pmd_anonymous_page+0x114/0x538 [223090.262576] [] handle_mm_fault+0xd40/0x17a4 [223090.262577] [] __get_user_pages+0x12c/0x36c [223090.262578] [] get_user_pages_unlocked+0xa4/0x1b8 [223090.262579] [] __gfn_to_pfn_memslot+0x280/0x31c [223090.262580] [] gfn_to_pfn_prot+0x4c/0x5c [223090.262582] [] kvm_handle_guest_abort+0x240/0x774 [223090.262584] [] handle_exit+0x11c/0x1ac [223090.262586] [] kvm_arch_vcpu_ioctl_run+0x31c/0x648 [223090.262587] [] kvm_vcpu_ioctl+0x378/0x768 [223090.262590] [] do_vfs_ioctl+0x324/0x5a4 [223090.262591] [] SyS_ioctl+0x90/0xa4 [223090.262595] [] el0_svc_naked+0x38/0x3c This patch moves the stage2 PGD manipulation under the lock. Reported-by: Alexander Graf Cc: Mark Rutland Cc: Marc Zyngier Cc: Paolo Bonzini Cc: Radim Krčmář Reviewed-by: Christoffer Dall Reviewed-by: Marc Zyngier Signed-off-by: Suzuki K Poulose Signed-off-by: Christoffer Dall Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmu.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index ba079e279b58..9b8a0ba33c9d 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -824,24 +824,25 @@ void stage2_unmap_vm(struct kvm *kvm) * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all * underlying level-2 and level-3 tables before freeing the actual level-1 table * and setting the struct pointer to NULL. - * - * Note we don't need locking here as this is only called when the VM is - * destroyed, which can only be done once. */ void kvm_free_stage2_pgd(struct kvm *kvm) { - if (kvm->arch.pgd == NULL) - return; + void *pgd = NULL; + void *hwpgd = NULL; spin_lock(&kvm->mmu_lock); - unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + if (kvm->arch.pgd) { + unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + pgd = kvm->arch.pgd; + hwpgd = kvm_get_hwpgd(kvm); + kvm->arch.pgd = NULL; + } spin_unlock(&kvm->mmu_lock); - kvm_free_hwpgd(kvm_get_hwpgd(kvm)); - if (KVM_PREALLOC_LEVEL > 0) - kfree(kvm->arch.pgd); - - kvm->arch.pgd = NULL; + if (hwpgd) + kvm_free_hwpgd(hwpgd); + if (KVM_PREALLOC_LEVEL > 0 && pgd) + kfree(pgd); } static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, From 628212c89faeed34704977a0edcfaaaae467a34d Mon Sep 17 00:00:00 2001 From: Suzuki K Poulose Date: Tue, 16 May 2017 10:34:54 +0100 Subject: [PATCH 107/546] kvm: arm/arm64: Force reading uncached stage2 PGD commit 2952a6070e07ebdd5896f1f5b861acad677caded upstream. Make sure we don't use a cached value of the KVM stage2 PGD while resetting the PGD. Cc: Marc Zyngier Signed-off-by: Suzuki K Poulose Reviewed-by: Christoffer Dall Signed-off-by: Christoffer Dall Signed-off-by: Suzuki K Poulose Signed-off-by: Greg Kroah-Hartman --- arch/arm/kvm/mmu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm/kvm/mmu.c b/arch/arm/kvm/mmu.c index 9b8a0ba33c9d..e8835d4e173c 100644 --- a/arch/arm/kvm/mmu.c +++ b/arch/arm/kvm/mmu.c @@ -833,7 +833,7 @@ void kvm_free_stage2_pgd(struct kvm *kvm) spin_lock(&kvm->mmu_lock); if (kvm->arch.pgd) { unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); - pgd = kvm->arch.pgd; + pgd = READ_ONCE(kvm->arch.pgd); hwpgd = kvm_get_hwpgd(kvm); kvm->arch.pgd = NULL; } From ab3ee6b53d678b37b90934fb67b01d5fc7aa3a85 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Fri, 1 Sep 2017 18:55:33 +0200 Subject: [PATCH 108/546] epoll: fix race between ep_poll_callback(POLLFREE) and ep_free()/ep_remove() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 138e4ad67afd5c6c318b056b4d17c17f2c0ca5c0 upstream. The race was introduced by me in commit 971316f0503a ("epoll: ep_unregister_pollwait() can use the freed pwq->whead"). I did not realize that nothing can protect eventpoll after ep_poll_callback() sets ->whead = NULL, only whead->lock can save us from the race with ep_free() or ep_remove(). Move ->whead = NULL to the end of ep_poll_callback() and add the necessary barriers. TODO: cleanup the ewake/EPOLLEXCLUSIVE logic, it was confusing even before this patch. Hopefully this explains use-after-free reported by syzcaller: BUG: KASAN: use-after-free in debug_spin_lock_before ... _raw_spin_lock_irqsave+0x4a/0x60 kernel/locking/spinlock.c:159 ep_poll_callback+0x29f/0xff0 fs/eventpoll.c:1148 this is spin_lock(eventpoll->lock), ... Freed by task 17774: ... kfree+0xe8/0x2c0 mm/slub.c:3883 ep_free+0x22c/0x2a0 fs/eventpoll.c:865 Fixes: 971316f0503a ("epoll: ep_unregister_pollwait() can use the freed pwq->whead") Reported-by: 范龙飞 Signed-off-by: Oleg Nesterov Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/eventpoll.c | 37 ++++++++++++++++++++++++------------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 1e009cad8d5c..1b08556776ce 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -518,8 +518,13 @@ static void ep_remove_wait_queue(struct eppoll_entry *pwq) wait_queue_head_t *whead; rcu_read_lock(); - /* If it is cleared by POLLFREE, it should be rcu-safe */ - whead = rcu_dereference(pwq->whead); + /* + * If it is cleared by POLLFREE, it should be rcu-safe. + * If we read NULL we need a barrier paired with + * smp_store_release() in ep_poll_callback(), otherwise + * we rely on whead->lock. + */ + whead = smp_load_acquire(&pwq->whead); if (whead) remove_wait_queue(whead, &pwq->wait); rcu_read_unlock(); @@ -1003,17 +1008,6 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k struct epitem *epi = ep_item_from_wait(wait); struct eventpoll *ep = epi->ep; - if ((unsigned long)key & POLLFREE) { - ep_pwq_from_wait(wait)->whead = NULL; - /* - * whead = NULL above can race with ep_remove_wait_queue() - * which can do another remove_wait_queue() after us, so we - * can't use __remove_wait_queue(). whead->lock is held by - * the caller. - */ - list_del_init(&wait->task_list); - } - spin_lock_irqsave(&ep->lock, flags); /* @@ -1078,6 +1072,23 @@ static int ep_poll_callback(wait_queue_t *wait, unsigned mode, int sync, void *k if (pwake) ep_poll_safewake(&ep->poll_wait); + + if ((unsigned long)key & POLLFREE) { + /* + * If we race with ep_remove_wait_queue() it can miss + * ->whead = NULL and do another remove_wait_queue() after + * us, so we can't use __remove_wait_queue(). + */ + list_del_init(&wait->task_list); + /* + * ->whead != NULL protects us from the race with ep_free() + * or ep_remove(), ep_remove_wait_queue() takes whead->lock + * held by the caller. Once we nullify it, nothing protects + * ep/epi or even wait. + */ + smp_store_release(&ep_pwq_from_wait(wait)->whead, NULL); + } + return 1; } From d95827490c39d20c148b3c4c3d250f9cce294eb5 Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Wed, 16 Aug 2017 11:56:24 +0200 Subject: [PATCH 109/546] crypto: algif_skcipher - only call put_page on referenced and used pages commit 445a582738de6802669aeed9c33ca406c23c3b1f upstream. For asynchronous operation, SGs are allocated without a page mapped to them or with a page that is not used (ref-counted). If the SGL is freed, the code must only call put_page for an SG if there was a page assigned and ref-counted in the first place. This fixes a kernel crash when using io_submit with more than one iocb using the sendmsg and sendpage (vmsplice/splice) interface. Signed-off-by: Stephan Mueller Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/algif_skcipher.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index f5e9f9310b48..b3b0004ea8ac 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -86,8 +86,13 @@ static void skcipher_free_async_sgls(struct skcipher_async_req *sreq) } sgl = sreq->tsg; n = sg_nents(sgl); - for_each_sg(sgl, sg, n, i) - put_page(sg_page(sg)); + for_each_sg(sgl, sg, n, i) { + struct page *page = sg_page(sg); + + /* some SGs may not have a page mapped */ + if (page && atomic_read(&page->_count)) + put_page(page); + } kfree(sreq->tsg); } From 573b59e17e37fc19577630391c08f7b1e26609d8 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 7 Sep 2017 08:34:25 +0200 Subject: [PATCH 110/546] Linux 4.4.87 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 1207bf6a0e7a..f6838187b568 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 86 +SUBLEVEL = 87 EXTRAVERSION = NAME = Blurry Fish Butt From f7a0f7318c27d889831ee354af34b986a0aa889a Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Wed, 16 Aug 2017 10:53:20 +0800 Subject: [PATCH 111/546] usb: quirks: add delay init quirk for Corsair Strafe RGB keyboard commit de3af5bf259d7a0bfaac70441c8568ab5998d80c upstream. Corsair Strafe RGB keyboard has trouble to initialize: [ 1.679455] usb 3-6: new full-speed USB device number 4 using xhci_hcd [ 6.871136] usb 3-6: unable to read config index 0 descriptor/all [ 6.871138] usb 3-6: can't read configurations, error -110 [ 6.991019] usb 3-6: new full-speed USB device number 5 using xhci_hcd [ 12.246642] usb 3-6: unable to read config index 0 descriptor/all [ 12.246644] usb 3-6: can't read configurations, error -110 [ 12.366555] usb 3-6: new full-speed USB device number 6 using xhci_hcd [ 17.622145] usb 3-6: unable to read config index 0 descriptor/all [ 17.622147] usb 3-6: can't read configurations, error -110 [ 17.742093] usb 3-6: new full-speed USB device number 7 using xhci_hcd [ 22.997715] usb 3-6: unable to read config index 0 descriptor/all [ 22.997716] usb 3-6: can't read configurations, error -110 Although it may work after several times unpluging/pluging: [ 68.195240] usb 3-6: new full-speed USB device number 11 using xhci_hcd [ 68.337459] usb 3-6: New USB device found, idVendor=1b1c, idProduct=1b20 [ 68.337463] usb 3-6: New USB device strings: Mfr=1, Product=2, SerialNumber=3 [ 68.337466] usb 3-6: Product: Corsair STRAFE RGB Gaming Keyboard [ 68.337468] usb 3-6: Manufacturer: Corsair [ 68.337470] usb 3-6: SerialNumber: 0F013021AEB8046755A93ED3F5001941 Tried three quirks: USB_QUIRK_DELAY_INIT, USB_QUIRK_NO_LPM and USB_QUIRK_DEVICE_QUALIFIER, user confirmed that USB_QUIRK_DELAY_INIT alone can workaround this issue. Hence add the quirk for Corsair Strafe RGB. BugLink: https://bugs.launchpad.net/bugs/1678477 Signed-off-by: Kai-Heng Feng Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 574da2b4529c..1ea5060dae69 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -217,6 +217,9 @@ static const struct usb_device_id usb_quirk_list[] = { { USB_DEVICE(0x1a0a, 0x0200), .driver_info = USB_QUIRK_LINEAR_UFRAME_INTR_BINTERVAL }, + /* Corsair Strafe RGB */ + { USB_DEVICE(0x1b1c, 0x1b20), .driver_info = USB_QUIRK_DELAY_INIT }, + /* Acer C120 LED Projector */ { USB_DEVICE(0x1de1, 0xc102), .driver_info = USB_QUIRK_NO_LPM }, From 6e957a81c77fd944f605f4f5d34af7cb43b1ad07 Mon Sep 17 00:00:00 2001 From: "Maciej S. Szmigiero" Date: Tue, 29 Aug 2017 21:50:03 +0200 Subject: [PATCH 112/546] USB: serial: option: add support for D-Link DWM-157 C1 commit 169e86546f5712179709de23cd64bbb15f199fab upstream. This commit adds support (an ID, really) for D-Link DWM-157 hardware version C1 USB modem to option driver. According to manufacturer-provided Windows INF file the device has four serial ports: "D-Link HSPA+DataCard Diagnostics Interface" (interface 2; modem port), "D-Link HSPA+DataCard NMEA Device" (interface 3), "D-Link HSPA+DataCard Speech Port" (interface 4), "D-Link HSPA+DataCard Debug Port" (interface 5). usb-devices output: T: Bus=05 Lev=01 Prnt=01 Port=04 Cnt=01 Dev#= 3 Spd=480 MxCh= 0 D: Ver= 2.00 Cls=ef(misc ) Sub=02 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=2001 ProdID=7d0e Rev=03.00 S: Manufacturer=D-Link,Inc S: Product=D-Link DWM-157 C: #Ifs= 7 Cfg#= 1 Atr=a0 MxPwr=500mA I: If#= 0 Alt= 0 #EPs= 1 Cls=02(commc) Sub=0e Prot=00 Driver=cdc_mbim I: If#= 1 Alt= 1 #EPs= 2 Cls=0a(data ) Sub=00 Prot=02 Driver=cdc_mbim I: If#= 2 Alt= 0 #EPs= 3 Cls=ff(vend.) Sub=02 Prot=01 Driver=option I: If#= 3 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 4 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 5 Alt= 0 #EPs= 2 Cls=ff(vend.) Sub=00 Prot=00 Driver=option I: If#= 6 Alt= 0 #EPs= 2 Cls=08(stor.) Sub=06 Prot=50 Driver=usb-storage Signed-off-by: Maciej S. Szmigiero Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index fe123153b1a5..2a9944326210 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -2023,6 +2023,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x7d03, 0xff, 0x02, 0x01) }, { USB_DEVICE_AND_INTERFACE_INFO(0x2001, 0x7d03, 0xff, 0x00, 0x00) }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7d04, 0xff) }, /* D-Link DWM-158 */ + { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7d0e, 0xff) }, /* D-Link DWM-157 C1 */ { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7e19, 0xff), /* D-Link DWM-221 B1 */ .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, { USB_DEVICE_INTERFACE_CLASS(0x2001, 0x7e35, 0xff), /* D-Link DWM-222 */ From b3e92cd7a820d5551cdb3e034991f7c6768cc8a5 Mon Sep 17 00:00:00 2001 From: Dmitry Fleytman Date: Fri, 25 Aug 2017 10:38:35 +0300 Subject: [PATCH 113/546] usb: Add device quirk for Logitech HD Pro Webcam C920-C commit a1279ef74eeeb5f627f091c71d80dd7ac766c99d upstream. Commit e0429362ab15 ("usb: Add device quirk for Logitech HD Pro Webcams C920 and C930e") introduced quirk to workaround an issue with some Logitech webcams. Apparently model C920-C has the same issue so applying the same quirk as well. See aforementioned commit message for detailed explanation of the problem. Signed-off-by: Dmitry Fleytman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/quirks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/quirks.c b/drivers/usb/core/quirks.c index 1ea5060dae69..82806e311202 100644 --- a/drivers/usb/core/quirks.c +++ b/drivers/usb/core/quirks.c @@ -57,8 +57,9 @@ static const struct usb_device_id usb_quirk_list[] = { /* Microsoft LifeCam-VX700 v2.0 */ { USB_DEVICE(0x045e, 0x0770), .driver_info = USB_QUIRK_RESET_RESUME }, - /* Logitech HD Pro Webcams C920 and C930e */ + /* Logitech HD Pro Webcams C920, C920-C and C930e */ { USB_DEVICE(0x046d, 0x082d), .driver_info = USB_QUIRK_DELAY_INIT }, + { USB_DEVICE(0x046d, 0x0841), .driver_info = USB_QUIRK_DELAY_INIT }, { USB_DEVICE(0x046d, 0x0843), .driver_info = USB_QUIRK_DELAY_INIT }, /* Logitech ConferenceCam CC3000e */ From 9f1d78c62a4b07e9acd180f2d036be8a4769c345 Mon Sep 17 00:00:00 2001 From: Sandeep Singh Date: Thu, 24 Aug 2017 09:57:15 +0530 Subject: [PATCH 114/546] usb:xhci:Fix regression when ATI chipsets detected commit e6b422b88b46353cf596e0db6dc0e39d50d90d6e upstream. The following commit cause a regression on ATI chipsets. 'commit e788787ef4f9 ("usb:xhci:Add quirk for Certain failing HP keyboard on reset after resume")' This causes pinfo->smbus_dev to be wrongly set to NULL on systems with the ATI chipset that this function checks for first. Added conditional check for AMD chipsets to avoid the overwriting pinfo->smbus_dev. Reported-by: Ben Hutchings Fixes: e788787ef4f9 ("usb:xhci:Add quirk for Certain failing HP keyboard on reset after resume") cc: Nehal Shah Signed-off-by: Sandeep Singh Signed-off-by: Shyam Sundar S K Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 35 ++++++++++++++++++----------------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 03b9a372636f..1fc6f478a02c 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -133,29 +133,30 @@ static int amd_chipset_sb_type_init(struct amd_chipset_info *pinfo) pinfo->sb_type.gen = AMD_CHIPSET_SB700; else if (rev >= 0x40 && rev <= 0x4f) pinfo->sb_type.gen = AMD_CHIPSET_SB800; - } - pinfo->smbus_dev = pci_get_device(PCI_VENDOR_ID_AMD, - 0x145c, NULL); - if (pinfo->smbus_dev) { - pinfo->sb_type.gen = AMD_CHIPSET_TAISHAN; } else { pinfo->smbus_dev = pci_get_device(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_HUDSON2_SMBUS, NULL); - if (!pinfo->smbus_dev) { - pinfo->sb_type.gen = NOT_AMD_CHIPSET; - return 0; + if (pinfo->smbus_dev) { + rev = pinfo->smbus_dev->revision; + if (rev >= 0x11 && rev <= 0x14) + pinfo->sb_type.gen = AMD_CHIPSET_HUDSON2; + else if (rev >= 0x15 && rev <= 0x18) + pinfo->sb_type.gen = AMD_CHIPSET_BOLTON; + else if (rev >= 0x39 && rev <= 0x3a) + pinfo->sb_type.gen = AMD_CHIPSET_YANGTZE; + } else { + pinfo->smbus_dev = pci_get_device(PCI_VENDOR_ID_AMD, + 0x145c, NULL); + if (pinfo->smbus_dev) { + rev = pinfo->smbus_dev->revision; + pinfo->sb_type.gen = AMD_CHIPSET_TAISHAN; + } else { + pinfo->sb_type.gen = NOT_AMD_CHIPSET; + return 0; + } } - - rev = pinfo->smbus_dev->revision; - if (rev >= 0x11 && rev <= 0x14) - pinfo->sb_type.gen = AMD_CHIPSET_HUDSON2; - else if (rev >= 0x15 && rev <= 0x18) - pinfo->sb_type.gen = AMD_CHIPSET_BOLTON; - else if (rev >= 0x39 && rev <= 0x3a) - pinfo->sb_type.gen = AMD_CHIPSET_YANGTZE; } - pinfo->sb_type.rev = rev; return 1; } From 812e484133fb71f937d04e21096d6b6ba55c2f7b Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 10 Aug 2017 15:42:22 -0700 Subject: [PATCH 115/546] USB: core: Avoid race of async_completed() w/ usbdev_release() commit ed62ca2f4f51c17841ea39d98c0c409cb53a3e10 upstream. While running reboot tests w/ a specific set of USB devices (and slub_debug enabled), I found that once every few hours my device would be crashed with a stack that looked like this: [ 14.012445] BUG: spinlock bad magic on CPU#0, modprobe/2091 [ 14.012460] lock: 0xffffffc0cb055978, .magic: ffffffc0, .owner: cryption contexts: %lu/%lu [ 14.012460] /1025536097, .owner_cpu: 0 [ 14.012466] CPU: 0 PID: 2091 Comm: modprobe Not tainted 4.4.79 #352 [ 14.012468] Hardware name: Google Kevin (DT) [ 14.012471] Call trace: [ 14.012483] [<....>] dump_backtrace+0x0/0x160 [ 14.012487] [<....>] show_stack+0x20/0x28 [ 14.012494] [<....>] dump_stack+0xb4/0xf0 [ 14.012500] [<....>] spin_dump+0x8c/0x98 [ 14.012504] [<....>] spin_bug+0x30/0x3c [ 14.012508] [<....>] do_raw_spin_lock+0x40/0x164 [ 14.012515] [<....>] _raw_spin_lock_irqsave+0x64/0x74 [ 14.012521] [<....>] __wake_up+0x2c/0x60 [ 14.012528] [<....>] async_completed+0x2d0/0x300 [ 14.012534] [<....>] __usb_hcd_giveback_urb+0xc4/0x138 [ 14.012538] [<....>] usb_hcd_giveback_urb+0x54/0xf0 [ 14.012544] [<....>] xhci_irq+0x1314/0x1348 [ 14.012548] [<....>] usb_hcd_irq+0x40/0x50 [ 14.012553] [<....>] handle_irq_event_percpu+0x1b4/0x3f0 [ 14.012556] [<....>] handle_irq_event+0x4c/0x7c [ 14.012561] [<....>] handle_fasteoi_irq+0x158/0x1c8 [ 14.012564] [<....>] generic_handle_irq+0x30/0x44 [ 14.012568] [<....>] __handle_domain_irq+0x90/0xbc [ 14.012572] [<....>] gic_handle_irq+0xcc/0x18c Investigation using kgdb() found that the wait queue that was passed into wake_up() had been freed (it was filled with slub_debug poison). I analyzed and instrumented the code and reproduced. My current belief is that this is happening: 1. async_completed() is called (from IRQ). Moves "as" onto the completed list. 2. On another CPU, proc_reapurbnonblock_compat() calls async_getcompleted(). Blocks on spinlock. 3. async_completed() releases the lock; keeps running; gets blocked midway through wake_up(). 4. proc_reapurbnonblock_compat() => async_getcompleted() gets the lock; removes "as" from completed list and frees it. 5. usbdev_release() is called. Frees "ps". 6. async_completed() finally continues running wake_up(). ...but wake_up() has a pointer to the freed "ps". The instrumentation that led me to believe this was based on adding some trace_printk() calls in a select few functions and then using kdb's "ftdump" at crash time. The trace follows (NOTE: in the trace below I cheated a little bit and added a udelay(1000) in async_completed() after releasing the spinlock because I wanted it to trigger quicker): <...>-2104 0d.h2 13759034us!: async_completed at start: as=ffffffc0cc638200 mtpd-2055 3.... 13759356us : async_getcompleted before spin_lock_irqsave mtpd-2055 3d..1 13759362us : async_getcompleted after list_del_init: as=ffffffc0cc638200 mtpd-2055 3.... 13759371us+: proc_reapurbnonblock_compat: free_async(ffffffc0cc638200) mtpd-2055 3.... 13759422us+: async_getcompleted before spin_lock_irqsave mtpd-2055 3.... 13759479us : usbdev_release at start: ps=ffffffc0cc042080 mtpd-2055 3.... 13759487us : async_getcompleted before spin_lock_irqsave mtpd-2055 3.... 13759497us!: usbdev_release after kfree(ps): ps=ffffffc0cc042080 <...>-2104 0d.h2 13760294us : async_completed before wake_up(): as=ffffffc0cc638200 To fix this problem we can just move the wake_up() under the ps->lock. There should be no issues there that I'm aware of. Signed-off-by: Douglas Anderson Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 54d2d6b604c0..873ba02d59e6 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -519,6 +519,8 @@ static void async_completed(struct urb *urb) if (as->status < 0 && as->bulk_addr && as->status != -ECONNRESET && as->status != -ENOENT) cancel_bulk_urbs(ps, as->bulk_addr); + + wake_up(&ps->wait); spin_unlock(&ps->lock); if (signr) { @@ -526,8 +528,6 @@ static void async_completed(struct urb *urb) put_pid(pid); put_cred(cred); } - - wake_up(&ps->wait); } static void destroy_async(struct usb_dev_state *ps, struct list_head *list) From f3584d55a8d8d0cff5a423e81162e069693c1914 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Aug 2017 14:34:16 +0100 Subject: [PATCH 116/546] staging/rts5208: fix incorrect shift to extract upper nybble commit 34ff1bf4920471cff66775dc39537b15c5f0feff upstream. The mask of sns_key_info1 suggests the upper nybble is being extracted however the following shift of 8 bits is too large and always results in 0. Fix this by shifting only by 4 bits to correctly get the upper nybble. Detected by CoverityScan, CID#142891 ("Operands don't affect result") Fixes: fa590c222fba ("staging: rts5208: add support for rts5208 and rts5288") Signed-off-by: Colin Ian King Signed-off-by: Greg Kroah-Hartman --- drivers/staging/rts5208/rtsx_scsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/rts5208/rtsx_scsi.c b/drivers/staging/rts5208/rtsx_scsi.c index 60871f3022b1..12a3893b98fd 100644 --- a/drivers/staging/rts5208/rtsx_scsi.c +++ b/drivers/staging/rts5208/rtsx_scsi.c @@ -414,7 +414,7 @@ void set_sense_data(struct rtsx_chip *chip, unsigned int lun, u8 err_code, sense->ascq = ascq; if (sns_key_info0 != 0) { sense->sns_key_info[0] = SKSV | sns_key_info0; - sense->sns_key_info[1] = (sns_key_info1 & 0xf0) >> 8; + sense->sns_key_info[1] = (sns_key_info1 & 0xf0) >> 4; sense->sns_key_info[2] = sns_key_info1 & 0x0f; } } From 1875ed81c2b747877a6c868d6aa25738ca77d27d Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Tue, 29 Aug 2017 21:23:49 +0200 Subject: [PATCH 117/546] driver core: bus: Fix a potential double free commit 0f9b011d3321ca1079c7a46c18cb1956fbdb7bcb upstream. The .release function of driver_ktype is 'driver_release()'. This function frees the container_of this kobject. So, this memory must not be freed explicitly in the error handling path of 'bus_add_driver()'. Otherwise a double free will occur. Signed-off-by: Christophe JAILLET Signed-off-by: Greg Kroah-Hartman --- drivers/base/bus.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/base/bus.c b/drivers/base/bus.c index 500592486e88..0346e46e2871 100644 --- a/drivers/base/bus.c +++ b/drivers/base/bus.c @@ -737,7 +737,7 @@ int bus_add_driver(struct device_driver *drv) out_unregister: kobject_put(&priv->kobj); - kfree(drv->p); + /* drv->p is freed in driver_release() */ drv->p = NULL; out_put_bus: bus_put(bus); From eb98d15d3cbecca8e603ec60ad272d7b2c8963b6 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jun 2016 16:11:13 +0300 Subject: [PATCH 118/546] intel_th: pci: Add Cannon Lake PCH-H support commit 84331e1390b6378a5129a3678c87a42c6f697d29 upstream. This adds Intel(R) Trace Hub PCI ID for Cannon Lake PCH-H. Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index d57a2f75dccf..32f6e530db58 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -72,6 +72,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa2a6), .driver_data = (kernel_ulong_t)0, }, + { + /* Cannon Lake H */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa326), + .driver_data = (kernel_ulong_t)0, + }, { 0 }, }; From 69eeacb5cd876ba2afd14aad5ea6016281d66cbe Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Thu, 30 Jun 2016 16:11:31 +0300 Subject: [PATCH 119/546] intel_th: pci: Add Cannon Lake PCH-LP support commit efb3669e14fe17d0ec4ecf57d0365039fe726f59 upstream. This adds Intel(R) Trace Hub PCI ID for Cannon Lake PCH-LP. Signed-off-by: Alexander Shishkin Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/intel_th/pci.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/hwtracing/intel_th/pci.c b/drivers/hwtracing/intel_th/pci.c index 32f6e530db58..32c6a40a408f 100644 --- a/drivers/hwtracing/intel_th/pci.c +++ b/drivers/hwtracing/intel_th/pci.c @@ -77,6 +77,11 @@ static const struct pci_device_id intel_th_pci_id_table[] = { PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0xa326), .driver_data = (kernel_ulong_t)0, }, + { + /* Cannon Lake LP */ + PCI_DEVICE(PCI_VENDOR_ID_INTEL, 0x9da6), + .driver_data = (kernel_ulong_t)0, + }, { 0 }, }; From 2c65494080c900c8a0aa4a865b57a8001960ff26 Mon Sep 17 00:00:00 2001 From: Rakesh Pillai Date: Wed, 2 Aug 2017 16:03:37 +0530 Subject: [PATCH 120/546] ath10k: fix memory leak in rx ring buffer allocation commit f35a7f91f66af528b3ee1921de16bea31d347ab0 upstream. The rx ring buffers are added to a hash table if firmware support full rx reorder. If the full rx reorder support flag is not set before allocating the rx ring buffers, none of the buffers are added to the hash table. There is a race condition between rx ring refill and rx buffer replenish from napi poll. The interrupts are enabled in hif start, before the rx ring is refilled during init. We replenish buffers from napi poll due to the interrupts which get enabled after hif start. Hence before the entire rx ring is refilled during the init, the napi poll replenishes a few buffers in steps of 100 buffers per attempt. During this rx ring replenish from napi poll, the rx reorder flag has not been set due to which the replenished buffers are not added to the hash table Set the rx full reorder support flag before we allocate the rx ring buffer to avoid the memory leak. Signed-off-by: Rakesh Pillai Signed-off-by: Kalle Valo Cc: Christian Lamparter Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/ath/ath10k/core.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/net/wireless/ath/ath10k/core.c b/drivers/net/wireless/ath/ath10k/core.c index 531de256d58d..05de75360fa4 100644 --- a/drivers/net/wireless/ath/ath10k/core.c +++ b/drivers/net/wireless/ath/ath10k/core.c @@ -1607,6 +1607,12 @@ int ath10k_core_start(struct ath10k *ar, enum ath10k_firmware_mode mode) goto err_wmi_detach; } + /* If firmware indicates Full Rx Reorder support it must be used in a + * slightly different manner. Let HTT code know. + */ + ar->htt.rx_ring.in_ord_rx = !!(test_bit(WMI_SERVICE_RX_FULL_REORDER, + ar->wmi.svc_map)); + status = ath10k_htt_rx_alloc(&ar->htt); if (status) { ath10k_err(ar, "failed to alloc htt rx: %d\n", status); @@ -1669,12 +1675,6 @@ int ath10k_core_start(struct ath10k *ar, enum ath10k_firmware_mode mode) goto err_hif_stop; } - /* If firmware indicates Full Rx Reorder support it must be used in a - * slightly different manner. Let HTT code know. - */ - ar->htt.rx_ring.in_ord_rx = !!(test_bit(WMI_SERVICE_RX_FULL_REORDER, - ar->wmi.svc_map)); - status = ath10k_htt_rx_ring_refill(ar); if (status) { ath10k_err(ar, "failed to refill htt rx ring: %d\n", status); From c5b8e1dd96299c0dd294e390485c759174844eb4 Mon Sep 17 00:00:00 2001 From: Oscar Campos Date: Tue, 18 Jul 2017 17:20:36 -0700 Subject: [PATCH 121/546] Input: trackpoint - assume 3 buttons when buttons detection fails commit 293b915fd9bebf33cdc906516fb28d54649a25ac upstream. Trackpoint buttons detection fails on ThinkPad 570 and 470 series, this makes the middle button of the trackpoint to not being recogized. As I don't believe there is any trackpoint with less than 3 buttons this patch just assumes three buttons when the extended button information read fails. Signed-off-by: Oscar Campos Acked-by: Peter Hutterer Signed-off-by: Dmitry Torokhov Signed-off-by: Aaron Ma Signed-off-by: Greg Kroah-Hartman --- drivers/input/mouse/trackpoint.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/input/mouse/trackpoint.c b/drivers/input/mouse/trackpoint.c index ce6ff9b301bb..7e2dc5e56632 100644 --- a/drivers/input/mouse/trackpoint.c +++ b/drivers/input/mouse/trackpoint.c @@ -381,8 +381,8 @@ int trackpoint_detect(struct psmouse *psmouse, bool set_properties) return 0; if (trackpoint_read(&psmouse->ps2dev, TP_EXT_BTN, &button_info)) { - psmouse_warn(psmouse, "failed to get extended button data\n"); - button_info = 0; + psmouse_warn(psmouse, "failed to get extended button data, assuming 3 buttons\n"); + button_info = 0x33; } psmouse->private = kzalloc(sizeof(struct trackpoint_data), GFP_KERNEL); From ca245a6414e4dba43c144939687df81fc4fba7b2 Mon Sep 17 00:00:00 2001 From: Malcolm Priestley Date: Sun, 30 Jul 2017 09:02:19 +0100 Subject: [PATCH 122/546] rtlwifi: rtl_pci_probe: Fix fail path of _rtl_pci_find_adapter commit fc81bab5eeb103711925d7510157cf5cd2b153f4 upstream. _rtl_pci_find_adapter fail path will jump to label fail3 for unsupported adapter types. However, on course for fail3 there will be call rtl_deinit_core before rtl_init_core. For the inclusion of checking pci_iounmap this fail can be moved to fail2. Fixes [ 4.492963] BUG: unable to handle kernel NULL pointer dereference at (null) [ 4.493067] IP: rtl_deinit_core+0x31/0x90 [rtlwifi] Signed-off-by: Malcolm Priestley Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/realtek/rtlwifi/pci.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/net/wireless/realtek/rtlwifi/pci.c b/drivers/net/wireless/realtek/rtlwifi/pci.c index a52230377e2c..c48b7e8ee0d6 100644 --- a/drivers/net/wireless/realtek/rtlwifi/pci.c +++ b/drivers/net/wireless/realtek/rtlwifi/pci.c @@ -2269,7 +2269,7 @@ int rtl_pci_probe(struct pci_dev *pdev, /* find adapter */ if (!_rtl_pci_find_adapter(pdev, hw)) { err = -ENODEV; - goto fail3; + goto fail2; } /* Init IO handler */ @@ -2339,10 +2339,10 @@ int rtl_pci_probe(struct pci_dev *pdev, pci_set_drvdata(pdev, NULL); rtl_deinit_core(hw); +fail2: if (rtlpriv->io.pci_mem_start != 0) pci_iounmap(pdev, (void __iomem *)rtlpriv->io.pci_mem_start); -fail2: pci_release_regions(pdev); complete(&rtlpriv->firmware_loading_complete); From bf3a0acce440d1c2efe95af3c9cbec68b0ea21a5 Mon Sep 17 00:00:00 2001 From: Dmitry Tunin Date: Tue, 8 Aug 2017 14:09:02 +0300 Subject: [PATCH 123/546] Bluetooth: Add support of 13d3:3494 RTL8723BE device commit a81d72d2002d6a932bd83022cbf8c442b1b97512 upstream. T: Bus=02 Lev=01 Prnt=01 Port=03 Cnt=03 Dev#= 4 Spd=12 MxCh= 0 D: Ver= 2.10 Cls=e0(wlcon) Sub=01 Prot=01 MxPS=64 #Cfgs= 1 P: Vendor=13d3 ProdID=3494 Rev= 2.00 S: Manufacturer=Realtek S: Product=Bluetooth Radio S: SerialNumber=00e04c000001 C:* #Ifs= 2 Cfg#= 1 Atr=e0 MxPwr=500mA I:* If#= 0 Alt= 0 #EPs= 3 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=81(I) Atr=03(Int.) MxPS= 16 Ivl=1ms E: Ad=02(O) Atr=02(Bulk) MxPS= 64 Ivl=0ms E: Ad=82(I) Atr=02(Bulk) MxPS= 64 Ivl=0ms I:* If#= 1 Alt= 0 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 0 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 0 Ivl=1ms I: If#= 1 Alt= 1 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 9 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 9 Ivl=1ms I: If#= 1 Alt= 2 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 17 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 17 Ivl=1ms I: If#= 1 Alt= 3 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 25 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 25 Ivl=1ms I: If#= 1 Alt= 4 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 33 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 33 Ivl=1ms I: If#= 1 Alt= 5 #EPs= 2 Cls=e0(wlcon) Sub=01 Prot=01 Driver=btusb E: Ad=03(O) Atr=01(Isoc) MxPS= 49 Ivl=1ms E: Ad=83(I) Atr=01(Isoc) MxPS= 49 Ivl=1ms Signed-off-by: Dmitry Tunin Signed-off-by: Marcel Holtmann Signed-off-by: Greg Kroah-Hartman --- drivers/bluetooth/btusb.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c index cd6b141b9825..7bb8055bd10c 100644 --- a/drivers/bluetooth/btusb.c +++ b/drivers/bluetooth/btusb.c @@ -333,6 +333,7 @@ static const struct usb_device_id blacklist_table[] = { { USB_DEVICE(0x13d3, 0x3410), .driver_info = BTUSB_REALTEK }, { USB_DEVICE(0x13d3, 0x3416), .driver_info = BTUSB_REALTEK }, { USB_DEVICE(0x13d3, 0x3459), .driver_info = BTUSB_REALTEK }, + { USB_DEVICE(0x13d3, 0x3494), .driver_info = BTUSB_REALTEK }, /* Additional Realtek 8821AE Bluetooth devices */ { USB_DEVICE(0x0b05, 0x17dc), .driver_info = BTUSB_REALTEK }, From 926374f5e66914f7b835eaa0f2c71144dc33e974 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Edwin=20T=C3=B6r=C3=B6k?= Date: Thu, 3 Aug 2017 10:30:06 +0100 Subject: [PATCH 124/546] dlm: avoid double-free on error path in dlm_device_{register,unregister} MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 55acdd926f6b21a5cdba23da98a48aedf19ac9c3 upstream. Can be reproduced when running dlm_controld (tested on 4.4.x, 4.12.4): # seq 1 100 | xargs -P0 -n1 dlm_tool join # seq 1 100 | xargs -P0 -n1 dlm_tool leave misc_register fails due to duplicate sysfs entry, which causes dlm_device_register to free ls->ls_device.name. In dlm_device_deregister the name was freed again, causing memory corruption. According to the comment in dlm_device_deregister the name should've been set to NULL when registration fails, so this patch does that. sysfs: cannot create duplicate filename '/dev/char/10:1' ------------[ cut here ]------------ warning: cpu: 1 pid: 4450 at fs/sysfs/dir.c:31 sysfs_warn_dup+0x56/0x70 modules linked in: msr rfcomm dlm ccm bnep dm_crypt uvcvideo videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 videobuf2_core videodev btusb media btrtl btbcm btintel bluetooth ecdh_generic intel_rapl x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm snd_hda_codec_hdmi irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel thinkpad_acpi pcbc nvram snd_seq_midi snd_seq_midi_event aesni_intel snd_hda_codec_realtek snd_hda_codec_generic snd_rawmidi aes_x86_64 crypto_simd glue_helper snd_hda_intel snd_hda_codec cryptd intel_cstate arc4 snd_hda_core snd_seq snd_seq_device snd_hwdep iwldvm intel_rapl_perf mac80211 joydev input_leds iwlwifi serio_raw cfg80211 snd_pcm shpchp snd_timer snd mac_hid mei_me lpc_ich mei soundcore sunrpc parport_pc ppdev lp parport autofs4 i915 psmouse e1000e ahci libahci i2c_algo_bit sdhci_pci ptp drm_kms_helper sdhci pps_core syscopyarea sysfillrect sysimgblt fb_sys_fops drm wmi video cpu: 1 pid: 4450 comm: dlm_test.exe not tainted 4.12.4-041204-generic hardware name: lenovo 232425u/232425u, bios g2et82ww (2.02 ) 09/11/2012 task: ffff96b0cbabe140 task.stack: ffffb199027d0000 rip: 0010:sysfs_warn_dup+0x56/0x70 rsp: 0018:ffffb199027d3c58 eflags: 00010282 rax: 0000000000000038 rbx: ffff96b0e2c49158 rcx: 0000000000000006 rdx: 0000000000000000 rsi: 0000000000000086 rdi: ffff96b15e24dcc0 rbp: ffffb199027d3c70 r08: 0000000000000001 r09: 0000000000000721 r10: ffffb199027d3c00 r11: 0000000000000721 r12: ffffb199027d3cd1 r13: ffff96b1592088f0 r14: 0000000000000001 r15: ffffffffffffffef fs: 00007f78069c0700(0000) gs:ffff96b15e240000(0000) knlgs:0000000000000000 cs: 0010 ds: 0000 es: 0000 cr0: 0000000080050033 cr2: 000000178625ed28 cr3: 0000000091d3e000 cr4: 00000000001406e0 call trace: sysfs_do_create_link_sd.isra.2+0x9e/0xb0 sysfs_create_link+0x25/0x40 device_add+0x5a9/0x640 device_create_groups_vargs+0xe0/0xf0 device_create_with_groups+0x3f/0x60 ? snprintf+0x45/0x70 misc_register+0x140/0x180 device_write+0x6a8/0x790 [dlm] __vfs_write+0x37/0x160 ? apparmor_file_permission+0x1a/0x20 ? security_file_permission+0x3b/0xc0 vfs_write+0xb5/0x1a0 sys_write+0x55/0xc0 ? sys_fcntl+0x5d/0xb0 entry_syscall_64_fastpath+0x1e/0xa9 rip: 0033:0x7f78083454bd rsp: 002b:00007f78069bbd30 eflags: 00000293 orig_rax: 0000000000000001 rax: ffffffffffffffda rbx: 0000000000000006 rcx: 00007f78083454bd rdx: 000000000000009c rsi: 00007f78069bee00 rdi: 0000000000000005 rbp: 00007f77f8000a20 r08: 000000000000fcf0 r09: 0000000000000032 r10: 0000000000000024 r11: 0000000000000293 r12: 00007f78069bde00 r13: 00007f78069bee00 r14: 000000000000000a r15: 00007f78069bbd70 code: 85 c0 48 89 c3 74 12 b9 00 10 00 00 48 89 c2 31 f6 4c 89 ef e8 2c c8 ff ff 4c 89 e2 48 89 de 48 c7 c7 b0 8e 0c a8 e8 41 e8 ed ff <0f> ff 48 89 df e8 00 d5 f4 ff 5b 41 5c 41 5d 5d c3 66 0f 1f 84 ---[ end trace 40412246357cc9e0 ]--- dlm: 59f24629-ae39-44e2-9030-397ebc2eda26: leaving the lockspace group... bug: unable to handle kernel null pointer dereference at 0000000000000001 ip: [] kmem_cache_alloc+0x7a/0x140 pgd 0 oops: 0000 [#1] smp modules linked in: dlm 8021q garp mrp stp llc openvswitch nf_defrag_ipv6 nf_conntrack libcrc32c iptable_filter dm_multipath crc32_pclmul dm_mod aesni_intel psmouse aes_x86_64 sg ablk_helper cryptd lrw gf128mul glue_helper i2c_piix4 nls_utf8 tpm_tis tpm isofs nfsd auth_rpcgss oid_registry nfs_acl lockd grace sunrpc xen_wdt ip_tables x_tables autofs4 hid_generic usbhid hid sr_mod cdrom sd_mod ata_generic pata_acpi 8139too serio_raw ata_piix 8139cp mii uhci_hcd ehci_pci ehci_hcd libata scsi_dh_rdac scsi_dh_hp_sw scsi_dh_emc scsi_dh_alua scsi_mod ipv6 cpu: 0 pid: 394 comm: systemd-udevd tainted: g w 4.4.0+0 #1 hardware name: xen hvm domu, bios 4.7.2-2.2 05/11/2017 task: ffff880002410000 ti: ffff88000243c000 task.ti: ffff88000243c000 rip: e030:[] [] kmem_cache_alloc+0x7a/0x140 rsp: e02b:ffff88000243fd90 eflags: 00010202 rax: 0000000000000000 rbx: ffff8800029864d0 rcx: 000000000007b36c rdx: 000000000007b36b rsi: 00000000024000c0 rdi: ffff880036801c00 rbp: ffff88000243fdc0 r08: 0000000000018880 r09: 0000000000000054 r10: 000000000000004a r11: ffff880034ace6c0 r12: 00000000024000c0 r13: ffff880036801c00 r14: 0000000000000001 r15: ffffffff8118dcc2 fs: 00007f0ab77548c0(0000) gs:ffff880036e00000(0000) knlgs:0000000000000000 cs: e033 ds: 0000 es: 0000 cr0: 0000000080050033 cr2: 0000000000000001 cr3: 000000000332d000 cr4: 0000000000040660 stack: ffffffff8118dc90 ffff8800029864d0 0000000000000000 ffff88003430b0b0 ffff880034b78320 ffff88003430b0b0 ffff88000243fdf8 ffffffff8118dcc2 ffff8800349c6700 ffff8800029864d0 000000000000000b 00007f0ab7754b90 call trace: [] ? anon_vma_fork+0x60/0x140 [] anon_vma_fork+0x92/0x140 [] copy_process+0xcae/0x1a80 [] _do_fork+0x8b/0x2d0 [] sys_clone+0x19/0x20 [] entry_syscall_64_fastpath+0x12/0x71 ] code: f6 75 1c 4c 89 fa 44 89 e6 4c 89 ef e8 a7 e4 00 00 41 f7 c4 00 80 00 00 49 89 c6 74 47 eb 32 49 63 45 20 48 8d 4a 01 4d 8b 45 00 <49> 8b 1c 06 4c 89 f0 65 49 0f c7 08 0f 94 c0 84 c0 74 ac 49 63 rip [] kmem_cache_alloc+0x7a/0x140 rsp cr2: 0000000000000001 --[ end trace 70cb9fd1b164a0e8 ]-- Signed-off-by: Edwin Török Signed-off-by: David Teigland Signed-off-by: Greg Kroah-Hartman --- fs/dlm/user.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 173b3873a4f4..e40c440a4555 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -355,6 +355,10 @@ static int dlm_device_register(struct dlm_ls *ls, char *name) error = misc_register(&ls->ls_device); if (error) { kfree(ls->ls_device.name); + /* this has to be set to NULL + * to avoid a double-free in dlm_device_deregister + */ + ls->ls_device.name = NULL; } fail: return error; From 4a9c294d7b1e7cf965e4b2e2e9955b9b548a1a0b Mon Sep 17 00:00:00 2001 From: Brian Norris Date: Thu, 29 Jun 2017 18:23:54 -0700 Subject: [PATCH 125/546] mwifiex: correct channel stat buffer overflows commit 4b5dde2d6234ff5bc68e97e6901d1f2a0a7f3749 upstream. mwifiex records information about various channels as it receives scan information. It does this by appending to a buffer that was sized to the max number of supported channels on any band, but there are numerous problems: (a) scans can return info from more than one band (e.g., both 2.4 and 5 GHz), so the determined "max" is not large enough (b) some firmware appears to return multiple results for a given channel, so the max *really* isn't large enough (c) there is no bounds checking when stashing these stats, so problems (a) and (b) can easily lead to buffer overflows Let's patch this by setting a slightly-more-correct max (that accounts for a combination of both 2.4G and 5G bands) and adding a bounds check when writing to our statistics buffer. Due to problem (b), we still might not properly report all known survey information (e.g., with "iw survey dump"), since duplicate results (or otherwise "larger than expected" results) will cause some truncation. But that's a problem for a future bugfix. (And because of this known deficiency, only log the excess at the WARN level, since that isn't visible by default in this driver and would otherwise be a bit too noisy.) Fixes: bf35443314ac ("mwifiex: channel statistics support for mwifiex") Cc: Avinash Patil Cc: Xinming Hu Signed-off-by: Brian Norris Reviewed-by: Dmitry Torokhov Reviewed-by: Ganapathi Bhat Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/mwifiex/cfg80211.c | 2 +- drivers/net/wireless/mwifiex/scan.c | 6 ++++++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/mwifiex/cfg80211.c b/drivers/net/wireless/mwifiex/cfg80211.c index c3331d6201c3..9a8982f581c5 100644 --- a/drivers/net/wireless/mwifiex/cfg80211.c +++ b/drivers/net/wireless/mwifiex/cfg80211.c @@ -3740,7 +3740,7 @@ int mwifiex_init_channel_scan_gap(struct mwifiex_adapter *adapter) if (adapter->config_bands & BAND_A) n_channels_a = mwifiex_band_5ghz.n_channels; - adapter->num_in_chan_stats = max_t(u32, n_channels_bg, n_channels_a); + adapter->num_in_chan_stats = n_channels_bg + n_channels_a; adapter->chan_stats = vmalloc(sizeof(*adapter->chan_stats) * adapter->num_in_chan_stats); diff --git a/drivers/net/wireless/mwifiex/scan.c b/drivers/net/wireless/mwifiex/scan.c index c20017ced566..fb98f42cb5e7 100644 --- a/drivers/net/wireless/mwifiex/scan.c +++ b/drivers/net/wireless/mwifiex/scan.c @@ -2170,6 +2170,12 @@ mwifiex_update_chan_statistics(struct mwifiex_private *priv, sizeof(struct mwifiex_chan_stats); for (i = 0 ; i < num_chan; i++) { + if (adapter->survey_idx >= adapter->num_in_chan_stats) { + mwifiex_dbg(adapter, WARN, + "FW reported too many channel results (max %d)\n", + adapter->num_in_chan_stats); + return; + } chan_stats.chan_num = fw_chan_stats->chan_num; chan_stats.bandcfg = fw_chan_stats->bandcfg; chan_stats.flags = fw_chan_stats->flags; From 966e3a2d98c18afb58e7c396d8b066cd893b2c10 Mon Sep 17 00:00:00 2001 From: Ilia Mirkin Date: Thu, 10 Aug 2017 12:13:40 -0400 Subject: [PATCH 126/546] drm/nouveau/pci/msi: disable MSI on big-endian platforms by default commit bc60c90f472b6e762ea96ef384072145adc8d4af upstream. It appears that MSI does not work on either G5 PPC nor on a E5500-based platform, where other hardware is reported to work fine with MSI. Both tests were conducted with NV4x hardware, so perhaps other (or even this) hardware can be made to work. It's still possible to force-enable with config=NvMSI=1 on load. Signed-off-by: Ilia Mirkin Signed-off-by: Ben Skeggs Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c index d671dcfaff3c..4896474da320 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/pci/base.c @@ -180,6 +180,10 @@ nvkm_pci_new_(const struct nvkm_pci_func *func, struct nvkm_device *device, } } +#ifdef __BIG_ENDIAN + pci->msi = false; +#endif + pci->msi = nvkm_boolopt(device->cfgopt, "NvMSI", pci->msi); if (pci->msi && func->msi_rearm) { pci->msi = pci_enable_msi(pci->pdev) == 0; From 302364990c0511009a1ff4de47ac448ad0e1ce04 Mon Sep 17 00:00:00 2001 From: Ben Hutchings Date: Sun, 3 Sep 2017 01:18:41 +0100 Subject: [PATCH 127/546] workqueue: Fix flag collision commit fbf1c41fc0f4d3574ac2377245efd666c1fa3075 upstream. Commit 0a94efb5acbb ("workqueue: implicit ordered attribute should be overridable") introduced a __WQ_ORDERED_EXPLICIT flag but gave it the same value as __WQ_LEGACY. I don't believe these were intended to mean the same thing, so renumber __WQ_ORDERED_EXPLICIT. Fixes: 0a94efb5acbb ("workqueue: implicit ordered attribute should be ...") Signed-off-by: Ben Hutchings Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- include/linux/workqueue.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h index 262d5c95dfc8..217abe56e711 100644 --- a/include/linux/workqueue.h +++ b/include/linux/workqueue.h @@ -311,7 +311,7 @@ enum { __WQ_DRAINING = 1 << 16, /* internal: workqueue is draining */ __WQ_ORDERED = 1 << 17, /* internal: workqueue is ordered */ - __WQ_ORDERED_EXPLICIT = 1 << 18, /* internal: alloc_ordered_workqueue() */ + __WQ_ORDERED_EXPLICIT = 1 << 19, /* internal: alloc_ordered_workqueue() */ WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ WQ_MAX_UNBOUND_PER_CPU = 4, /* 4 * #cpus for unbound wq */ From 9a4cabf3bf8b0ac0993b9c45a37def6ec2d60da6 Mon Sep 17 00:00:00 2001 From: Andrey Korolyov Date: Thu, 10 Aug 2017 13:21:14 +0300 Subject: [PATCH 128/546] cs5536: add support for IDE controller variant commit 591b6bb605785c12a21e8b07a08a277065b655a5 upstream. Several legacy devices such as Geode-based Cisco ASA appliances and DB800 development board do possess CS5536 IDE controller with different PCI id than existing one. Using pata_generic is not always feasible as at least DB800 requires MSR quirk from pata_cs5536 to be used with vendor firmware. Signed-off-by: Andrey Korolyov Signed-off-by: Tejun Heo Signed-off-by: Greg Kroah-Hartman --- drivers/ata/pata_amd.c | 1 + drivers/ata/pata_cs5536.c | 1 + include/linux/pci_ids.h | 1 + 3 files changed, 3 insertions(+) diff --git a/drivers/ata/pata_amd.c b/drivers/ata/pata_amd.c index 8d4d959a821c..8706533db57b 100644 --- a/drivers/ata/pata_amd.c +++ b/drivers/ata/pata_amd.c @@ -616,6 +616,7 @@ static const struct pci_device_id amd[] = { { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP73_IDE), 8 }, { PCI_VDEVICE(NVIDIA, PCI_DEVICE_ID_NVIDIA_NFORCE_MCP77_IDE), 8 }, { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_IDE), 9 }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_DEV_IDE), 9 }, { }, }; diff --git a/drivers/ata/pata_cs5536.c b/drivers/ata/pata_cs5536.c index 6c15a554efbe..dc1255294628 100644 --- a/drivers/ata/pata_cs5536.c +++ b/drivers/ata/pata_cs5536.c @@ -289,6 +289,7 @@ static int cs5536_init_one(struct pci_dev *dev, const struct pci_device_id *id) static const struct pci_device_id cs5536[] = { { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_IDE), }, + { PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_CS5536_DEV_IDE), }, { }, }; diff --git a/include/linux/pci_ids.h b/include/linux/pci_ids.h index 37f05cb1dfd6..1af616138d1d 100644 --- a/include/linux/pci_ids.h +++ b/include/linux/pci_ids.h @@ -573,6 +573,7 @@ #define PCI_DEVICE_ID_AMD_CS5536_EHC 0x2095 #define PCI_DEVICE_ID_AMD_CS5536_UDC 0x2096 #define PCI_DEVICE_ID_AMD_CS5536_UOC 0x2097 +#define PCI_DEVICE_ID_AMD_CS5536_DEV_IDE 0x2092 #define PCI_DEVICE_ID_AMD_CS5536_IDE 0x209A #define PCI_DEVICE_ID_AMD_LX_VIDEO 0x2081 #define PCI_DEVICE_ID_AMD_LX_AES 0x2082 From 0d7592a03b8abd032afa19aed99ab89dec7f394a Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Tue, 15 Aug 2017 22:41:08 -0700 Subject: [PATCH 129/546] scsi: sg: protect against races between mmap() and SG_SET_RESERVED_SIZE commit 6a8dadcca81fceff9976e8828cceb072873b7bd5 upstream. Take f_mutex around mmap() processing to protect against races with the SG_SET_RESERVED_SIZE ioctl. Ensure the reserve buffer length remains consistent during the mapping operation, and set the "mmap called" flag to prevent further changes to the reserved buffer size as an atomic operation with the mapping. [mkp: fixed whitespace] Signed-off-by: Todd Poynor Acked-by: Douglas Gilbert Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 8a9e139e2853..fc1d9d33873f 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1254,6 +1254,7 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) unsigned long req_sz, len, sa; Sg_scatter_hold *rsv_schp; int k, length; + int ret = 0; if ((!filp) || (!vma) || (!(sfp = (Sg_fd *) filp->private_data))) return -ENXIO; @@ -1264,8 +1265,11 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) if (vma->vm_pgoff) return -EINVAL; /* want no offset */ rsv_schp = &sfp->reserve; - if (req_sz > rsv_schp->bufflen) - return -ENOMEM; /* cannot map more than reserved buffer */ + mutex_lock(&sfp->f_mutex); + if (req_sz > rsv_schp->bufflen) { + ret = -ENOMEM; /* cannot map more than reserved buffer */ + goto out; + } sa = vma->vm_start; length = 1 << (PAGE_SHIFT + rsv_schp->page_order); @@ -1279,7 +1283,9 @@ sg_mmap(struct file *filp, struct vm_area_struct *vma) vma->vm_flags |= VM_IO | VM_DONTEXPAND | VM_DONTDUMP; vma->vm_private_data = sfp; vma->vm_ops = &sg_mmap_vm_ops; - return 0; +out: + mutex_unlock(&sfp->f_mutex); + return ret; } static void From a2e71dcfb0d47748ba8d947db52e4dafc019c6dc Mon Sep 17 00:00:00 2001 From: Todd Poynor Date: Tue, 15 Aug 2017 21:48:43 -0700 Subject: [PATCH 130/546] scsi: sg: recheck MMAP_IO request length with lock held commit 8d26f491116feaa0b16de370b6a7ba40a40fa0b4 upstream. Commit 1bc0eb044615 ("scsi: sg: protect accesses to 'reserved' page array") adds needed concurrency protection for the "reserve" buffer. Some checks that are initially made outside the lock are replicated once the lock is taken to ensure the checks and resulting decisions are made using consistent state. The check that a request with flag SG_FLAG_MMAP_IO set fits in the reserve buffer also needs to be performed again under the lock to ensure the reserve buffer length compared against matches the value in effect when the request is linked to the reserve buffer. An -ENOMEM should be returned in this case, instead of switching over to an indirect buffer as for non-MMAP_IO requests. Signed-off-by: Todd Poynor Acked-by: Douglas Gilbert Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index fc1d9d33873f..71325972e503 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1757,9 +1757,12 @@ sg_start_req(Sg_request *srp, unsigned char *cmd) !sfp->res_in_use) { sfp->res_in_use = 1; sg_link_reserve(sfp, srp, dxfer_len); - } else if ((hp->flags & SG_FLAG_MMAP_IO) && sfp->res_in_use) { + } else if (hp->flags & SG_FLAG_MMAP_IO) { + res = -EBUSY; /* sfp->res_in_use == 1 */ + if (dxfer_len > rsv_schp->bufflen) + res = -ENOMEM; mutex_unlock(&sfp->f_mutex); - return -EBUSY; + return res; } else { res = sg_build_indirect(req_schp, sfp, dxfer_len); if (res) { From aea7e5ce4a52130b12ea23f15af7ac7aa1ab8ef9 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 4 Jan 2016 03:33:45 +0100 Subject: [PATCH 131/546] drm: adv7511: really enable interrupts for EDID detection commit d0be8584b01160eb6f49e77f8e9c1da286bb4ffb upstream. The interrupts for EDID_READY or DDC_ERROR were never enabled in this driver, so reading EDID always timed out when chip was powered down and interrupts were used. Fix this and also remove clearing the interrupt flags, they are cleared in POWER_DOWN mode anyhow (unlike the interrupt enable flags) according to docs and my tests. Signed-off-by: Wolfram Sang Tested-by: Archit Taneja Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i2c/adv7511.c | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/gpu/drm/i2c/adv7511.c b/drivers/gpu/drm/i2c/adv7511.c index 00416f23b5cb..cc916784b6f0 100644 --- a/drivers/gpu/drm/i2c/adv7511.c +++ b/drivers/gpu/drm/i2c/adv7511.c @@ -362,12 +362,19 @@ static void adv7511_power_on(struct adv7511 *adv7511) { adv7511->current_edid_segment = -1; - regmap_write(adv7511->regmap, ADV7511_REG_INT(0), - ADV7511_INT0_EDID_READY); - regmap_write(adv7511->regmap, ADV7511_REG_INT(1), - ADV7511_INT1_DDC_ERROR); regmap_update_bits(adv7511->regmap, ADV7511_REG_POWER, ADV7511_POWER_POWER_DOWN, 0); + if (adv7511->i2c_main->irq) { + /* + * Documentation says the INT_ENABLE registers are reset in + * POWER_DOWN mode. My 7511w preserved the bits, however. + * Still, let's be safe and stick to the documentation. + */ + regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(0), + ADV7511_INT0_EDID_READY); + regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(1), + ADV7511_INT1_DDC_ERROR); + } /* * Per spec it is allowed to pulse the HDP signal to indicate that the @@ -567,12 +574,14 @@ static int adv7511_get_modes(struct drm_encoder *encoder, /* Reading the EDID only works if the device is powered */ if (!adv7511->powered) { - regmap_write(adv7511->regmap, ADV7511_REG_INT(0), - ADV7511_INT0_EDID_READY); - regmap_write(adv7511->regmap, ADV7511_REG_INT(1), - ADV7511_INT1_DDC_ERROR); regmap_update_bits(adv7511->regmap, ADV7511_REG_POWER, ADV7511_POWER_POWER_DOWN, 0); + if (adv7511->i2c_main->irq) { + regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(0), + ADV7511_INT0_EDID_READY); + regmap_write(adv7511->regmap, ADV7511_REG_INT_ENABLE(1), + ADV7511_INT1_DDC_ERROR); + } adv7511->current_edid_segment = -1; } From c634cecad4c17e9761df9aa1cb22c707a402f4e8 Mon Sep 17 00:00:00 2001 From: Archit Taneja Date: Wed, 15 Jun 2016 16:20:45 +0530 Subject: [PATCH 132/546] drm/bridge: adv7511: Fix mutex deadlock when interrupts are disabled commit f0bfcc22d9822947b0ad3095e8363eab5261864c upstream. When the adv7511 i2c client doesn't have an interrupt line, we observe a deadlock on caused by trying to lock drm device's mode_config.mutex twice in the same context. Here is the sequence that causes it: ioctl DRM_IOCTL_MODE_GETCONNECTOR from userspace drm_mode_getconnector (acquires mode_config mutex) connector->fill_modes() drm_helper_probe_single_connector_modes connector_funcs->get_modes adv7511_encoder_get_modes adv7511_get_edid_block adv7511_irq_process drm_helper_hpd_irq_event (acquires mode_config mutex again) In adv7511_irq_process, don't call drm_helper_hpd_irq_event when not called from the interrupt handler. It doesn't serve any purpose there anyway. Signed-off-by: Archit Taneja Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i2c/adv7511.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i2c/adv7511.c b/drivers/gpu/drm/i2c/adv7511.c index cc916784b6f0..5b7896352498 100644 --- a/drivers/gpu/drm/i2c/adv7511.c +++ b/drivers/gpu/drm/i2c/adv7511.c @@ -429,7 +429,7 @@ static bool adv7511_hpd(struct adv7511 *adv7511) return false; } -static int adv7511_irq_process(struct adv7511 *adv7511) +static int adv7511_irq_process(struct adv7511 *adv7511, bool process_hpd) { unsigned int irq0, irq1; int ret; @@ -445,7 +445,7 @@ static int adv7511_irq_process(struct adv7511 *adv7511) regmap_write(adv7511->regmap, ADV7511_REG_INT(0), irq0); regmap_write(adv7511->regmap, ADV7511_REG_INT(1), irq1); - if (irq0 & ADV7511_INT0_HDP && adv7511->encoder) + if (process_hpd && irq0 & ADV7511_INT0_HDP && adv7511->encoder) drm_helper_hpd_irq_event(adv7511->encoder->dev); if (irq0 & ADV7511_INT0_EDID_READY || irq1 & ADV7511_INT1_DDC_ERROR) { @@ -463,7 +463,7 @@ static irqreturn_t adv7511_irq_handler(int irq, void *devid) struct adv7511 *adv7511 = devid; int ret; - ret = adv7511_irq_process(adv7511); + ret = adv7511_irq_process(adv7511, true); return ret < 0 ? IRQ_NONE : IRQ_HANDLED; } @@ -480,7 +480,7 @@ static int adv7511_wait_for_edid(struct adv7511 *adv7511, int timeout) adv7511->edid_read, msecs_to_jiffies(timeout)); } else { for (; timeout > 0; timeout -= 25) { - ret = adv7511_irq_process(adv7511); + ret = adv7511_irq_process(adv7511, false); if (ret < 0) break; From 9183e45db7774716d056319c237a4185baba19c7 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 16 Jan 2017 16:52:47 -0800 Subject: [PATCH 133/546] drm/bridge: adv7511: Use work_struct to defer hotplug handing to out of irq context commit 518cb7057a59b9441336d2e88a396d52b6ab0cce upstream. I was recently seeing issues with EDID probing, where the logic to wait for the EDID read bit to be set by the IRQ wasn't happening and the code would time out and fail. Digging deeper, I found this was due to the fact that IRQs were disabled as we were running in IRQ context from the HPD signal. Thus this patch changes the logic to handle the HPD signal via a work_struct so we can be out of irq context. With this patch, the EDID probing on hotplug does not time out. Cc: David Airlie Cc: Archit Taneja Cc: Wolfram Sang Cc: Lars-Peter Clausen Cc: Laurent Pinchart Cc: dri-devel@lists.freedesktop.org Reviewed-by: Laurent Pinchart Tested-by: Laurent Pinchart Signed-off-by: John Stultz Signed-off-by: Archit Taneja Link: http://patchwork.freedesktop.org/patch/msgid/1484614372-15342-2-git-send-email-john.stultz@linaro.org Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i2c/adv7511.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i2c/adv7511.c b/drivers/gpu/drm/i2c/adv7511.c index 5b7896352498..20780c2690c6 100644 --- a/drivers/gpu/drm/i2c/adv7511.c +++ b/drivers/gpu/drm/i2c/adv7511.c @@ -36,7 +36,10 @@ struct adv7511 { bool edid_read; wait_queue_head_t wq; + struct work_struct hpd_work; + struct drm_encoder *encoder; + struct drm_connector connector; bool embedded_sync; enum adv7511_sync_polarity vsync_polarity; @@ -429,6 +432,13 @@ static bool adv7511_hpd(struct adv7511 *adv7511) return false; } +static void adv7511_hpd_work(struct work_struct *work) +{ + struct adv7511 *adv7511 = container_of(work, struct adv7511, hpd_work); + + drm_helper_hpd_irq_event(adv7511->connector.dev); +} + static int adv7511_irq_process(struct adv7511 *adv7511, bool process_hpd) { unsigned int irq0, irq1; @@ -446,7 +456,7 @@ static int adv7511_irq_process(struct adv7511 *adv7511, bool process_hpd) regmap_write(adv7511->regmap, ADV7511_REG_INT(1), irq1); if (process_hpd && irq0 & ADV7511_INT0_HDP && adv7511->encoder) - drm_helper_hpd_irq_event(adv7511->encoder->dev); + schedule_work(&adv7511->hpd_work); if (irq0 & ADV7511_INT0_EDID_READY || irq1 & ADV7511_INT1_DDC_ERROR) { adv7511->edid_read = true; @@ -922,6 +932,8 @@ static int adv7511_probe(struct i2c_client *i2c, const struct i2c_device_id *id) if (!adv7511->i2c_edid) return -ENOMEM; + INIT_WORK(&adv7511->hpd_work, adv7511_hpd_work); + if (i2c->irq) { init_waitqueue_head(&adv7511->wq); From e22a4308547cd49232df7c3b3c037b4293448305 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 16 Jan 2017 16:52:48 -0800 Subject: [PATCH 134/546] drm/bridge: adv7511: Switch to using drm_kms_helper_hotplug_event() commit 6d5104c5a6b56385426e15047050584794bb6254 upstream. In chasing down a previous issue with EDID probing from calling drm_helper_hpd_irq_event() from irq context, Laurent noticed that the DRM documentation suggests that drm_kms_helper_hotplug_event() should be used instead. Thus this patch replaces drm_helper_hpd_irq_event() with drm_kms_helper_hotplug_event(), which requires we update the connector.status entry and only call _hotplug_event() when the status changes. Cc: David Airlie Cc: Archit Taneja Cc: Wolfram Sang Cc: Lars-Peter Clausen Cc: Laurent Pinchart Cc: dri-devel@lists.freedesktop.org Reviewed-by: Laurent Pinchart Tested-by: Laurent Pinchart Signed-off-by: John Stultz Signed-off-by: Archit Taneja Link: http://patchwork.freedesktop.org/patch/msgid/1484614372-15342-3-git-send-email-john.stultz@linaro.org Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i2c/adv7511.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/i2c/adv7511.c b/drivers/gpu/drm/i2c/adv7511.c index 20780c2690c6..ebb0fc84955a 100644 --- a/drivers/gpu/drm/i2c/adv7511.c +++ b/drivers/gpu/drm/i2c/adv7511.c @@ -435,8 +435,21 @@ static bool adv7511_hpd(struct adv7511 *adv7511) static void adv7511_hpd_work(struct work_struct *work) { struct adv7511 *adv7511 = container_of(work, struct adv7511, hpd_work); + enum drm_connector_status status; + unsigned int val; + int ret; + ret = regmap_read(adv7511->regmap, ADV7511_REG_STATUS, &val); + if (ret < 0) + status = connector_status_disconnected; + else if (val & ADV7511_STATUS_HPD) + status = connector_status_connected; + else + status = connector_status_disconnected; - drm_helper_hpd_irq_event(adv7511->connector.dev); + if (adv7511->connector.status != status) { + adv7511->connector.status = status; + drm_kms_helper_hotplug_event(adv7511->connector.dev); + } } static int adv7511_irq_process(struct adv7511 *adv7511, bool process_hpd) From f4596ead66a74ca566d1aa17dfa7e42bba97b5f5 Mon Sep 17 00:00:00 2001 From: John Stultz Date: Mon, 16 Jan 2017 16:52:52 -0800 Subject: [PATCH 135/546] drm/bridge: adv7511: Re-write the i2c address before EDID probing commit 3587c856675c45809010c2cee5b21096f6e8e938 upstream. I've found that by just turning the chip on and off via the POWER_DOWN register, I end up getting i2c_transfer errors on HiKey. Investigating further, it turns out that some of the register state in hardware is getting lost, as the device registers are reset when the chip is powered down. Thus this patch simply re-writes the i2c address to the ADV7511_REG_EDID_I2C_ADDR register to ensure its properly set before we try to read the EDID data. Cc: David Airlie Cc: Archit Taneja Cc: Wolfram Sang Cc: Lars-Peter Clausen Cc: Laurent Pinchart Cc: dri-devel@lists.freedesktop.org Reviewed-by: Laurent Pinchart Tested-by: Laurent Pinchart Signed-off-by: John Stultz Signed-off-by: Archit Taneja Link: http://patchwork.freedesktop.org/patch/msgid/1484614372-15342-7-git-send-email-john.stultz@linaro.org Signed-off-by: Thong Ho Signed-off-by: Nhan Nguyen Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i2c/adv7511.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/gpu/drm/i2c/adv7511.c b/drivers/gpu/drm/i2c/adv7511.c index ebb0fc84955a..dba5c0ea0827 100644 --- a/drivers/gpu/drm/i2c/adv7511.c +++ b/drivers/gpu/drm/i2c/adv7511.c @@ -51,6 +51,10 @@ struct adv7511 { struct gpio_desc *gpio_pd; }; +static const int edid_i2c_addr = 0x7e; +static const int packet_i2c_addr = 0x70; +static const int cec_i2c_addr = 0x78; + static struct adv7511 *encoder_to_adv7511(struct drm_encoder *encoder) { return to_encoder_slave(encoder)->slave_priv; @@ -606,6 +610,9 @@ static int adv7511_get_modes(struct drm_encoder *encoder, ADV7511_INT1_DDC_ERROR); } adv7511->current_edid_segment = -1; + /* Reset the EDID_I2C_ADDR register as it might be cleared */ + regmap_write(adv7511->regmap, ADV7511_REG_EDID_I2C_ADDR, + edid_i2c_addr); } edid = drm_do_get_edid(connector, adv7511_get_edid_block, adv7511); @@ -881,10 +888,6 @@ static int adv7511_parse_dt(struct device_node *np, return 0; } -static const int edid_i2c_addr = 0x7e; -static const int packet_i2c_addr = 0x70; -static const int cec_i2c_addr = 0x78; - static int adv7511_probe(struct i2c_client *i2c, const struct i2c_device_id *id) { struct adv7511_link_config link_config; From 693b7f62a439ec11a671eee76cad2e7078dad913 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Tue, 4 Jul 2017 21:49:06 +1000 Subject: [PATCH 136/546] btrfs: resume qgroup rescan on rw remount commit 6c6b5a39c4bf3dbd8cf629c9f5450e983c19dbb9 upstream. Several distributions mount the "proper root" as ro during initrd and then remount it as rw before pivot_root(2). Thus, if a rescan had been aborted by a previous shutdown, the rescan would never be resumed. This issue would manifest itself as several btrfs ioctl(2)s causing the entire machine to hang when btrfs_qgroup_wait_for_completion was hit (due to the fs_info->qgroup_rescan_running flag being set but the rescan itself not being resumed). Notably, Docker's btrfs storage driver makes regular use of BTRFS_QUOTA_CTL_DISABLE and BTRFS_IOC_QUOTA_RESCAN_WAIT (causing this problem to be manifested on boot for some machines). Cc: Jeff Mahoney Fixes: b382a324b60f ("Btrfs: fix qgroup rescan resume on mount") Signed-off-by: Aleksa Sarai Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 5d34a062ca4f..3bd2233737ac 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1727,6 +1727,8 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) goto restore; } + btrfs_qgroup_rescan_resume(fs_info); + if (!fs_info->uuid_root) { btrfs_info(fs_info, "creating UUID tree"); ret = btrfs_create_uuid_tree(fs_info); From 10863607c242e970cfc14c42b35689737c397fe4 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 10 Nov 2016 13:06:39 -0800 Subject: [PATCH 137/546] locktorture: Fix potential memory leak with rw lock test MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit f4dbba591945dc301c302672adefba9e2ec08dc5 upstream. When running locktorture module with the below commands with kmemleak enabled: $ modprobe locktorture torture_type=rw_lock_irq $ rmmod locktorture The below kmemleak got caught: root@10:~# echo scan > /sys/kernel/debug/kmemleak [ 323.197029] kmemleak: 2 new suspected memory leaks (see /sys/kernel/debug/kmemleak) root@10:~# cat /sys/kernel/debug/kmemleak unreferenced object 0xffffffc07592d500 (size 128): comm "modprobe", pid 368, jiffies 4294924118 (age 205.824s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 c3 7b 02 00 00 00 00 00 .........{...... 00 00 00 00 00 00 00 00 d7 9b 02 00 00 00 00 00 ................ backtrace: [] create_object+0x110/0x288 [] kmemleak_alloc+0x58/0xa0 [] __kmalloc+0x234/0x318 [] 0xffffff80006fa130 [] do_one_initcall+0x44/0x138 [] do_init_module+0x68/0x1cc [] load_module+0x1a68/0x22e0 [] SyS_finit_module+0xe0/0xf0 [] el0_svc_naked+0x24/0x28 [] 0xffffffffffffffff unreferenced object 0xffffffc07592d480 (size 128): comm "modprobe", pid 368, jiffies 4294924118 (age 205.824s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 3b 6f 01 00 00 00 00 00 ........;o...... 00 00 00 00 00 00 00 00 23 6a 01 00 00 00 00 00 ........#j...... backtrace: [] create_object+0x110/0x288 [] kmemleak_alloc+0x58/0xa0 [] __kmalloc+0x234/0x318 [] 0xffffff80006fa22c [] do_one_initcall+0x44/0x138 [] do_init_module+0x68/0x1cc [] load_module+0x1a68/0x22e0 [] SyS_finit_module+0xe0/0xf0 [] el0_svc_naked+0x24/0x28 [] 0xffffffffffffffff It is because cxt.lwsa and cxt.lrsa don't get freed in module_exit, so free them in lock_torture_cleanup() and free writer_tasks if reader_tasks is failed at memory allocation. Signed-off-by: Yang Shi Signed-off-by: Paul E. McKenney Reviewed-by: Josh Triplett Cc: 石洋 Signed-off-by: Greg Kroah-Hartman --- kernel/locking/locktorture.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/locking/locktorture.c b/kernel/locking/locktorture.c index 8ef1919d63b2..d580b7d6ee6d 100644 --- a/kernel/locking/locktorture.c +++ b/kernel/locking/locktorture.c @@ -776,6 +776,8 @@ static void lock_torture_cleanup(void) else lock_torture_print_module_parms(cxt.cur_ops, "End of test: SUCCESS"); + kfree(cxt.lwsa); + kfree(cxt.lrsa); torture_cleanup_end(); } @@ -917,6 +919,8 @@ static int __init lock_torture_init(void) GFP_KERNEL); if (reader_tasks == NULL) { VERBOSE_TOROUT_ERRSTRING("reader_tasks: Out of memory"); + kfree(writer_tasks); + writer_tasks = NULL; firsterr = -ENOMEM; goto unwind; } From 55681470154567b4a8a30ec8b35a8ebd5a4f3608 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Thu, 6 Jul 2017 12:34:40 +0200 Subject: [PATCH 138/546] ALSA: msnd: Optimize / harden DSP and MIDI loops commit 20e2b791796bd68816fa115f12be5320de2b8021 upstream. The ISA msnd drivers have loops fetching the ring-buffer head, tail and size values inside the loops. Such codes are inefficient and fragile. This patch optimizes it, and also adds the sanity check to avoid the endless loops. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196131 Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=196133 Signed-off-by: Takashi Iwai Signed-off-by: grygorii tertychnyi Signed-off-by: Greg Kroah-Hartman --- sound/isa/msnd/msnd_midi.c | 28 ++++++++++++++-------------- sound/isa/msnd/msnd_pinnacle.c | 23 ++++++++++++----------- 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/sound/isa/msnd/msnd_midi.c b/sound/isa/msnd/msnd_midi.c index ffc67fd80c23..58e59cd3c95c 100644 --- a/sound/isa/msnd/msnd_midi.c +++ b/sound/isa/msnd/msnd_midi.c @@ -120,24 +120,24 @@ void snd_msndmidi_input_read(void *mpuv) unsigned long flags; struct snd_msndmidi *mpu = mpuv; void *pwMIDQData = mpu->dev->mappedbase + MIDQ_DATA_BUFF; + u16 head, tail, size; spin_lock_irqsave(&mpu->input_lock, flags); - while (readw(mpu->dev->MIDQ + JQS_wTail) != - readw(mpu->dev->MIDQ + JQS_wHead)) { - u16 wTmp, val; - val = readw(pwMIDQData + 2 * readw(mpu->dev->MIDQ + JQS_wHead)); + head = readw(mpu->dev->MIDQ + JQS_wHead); + tail = readw(mpu->dev->MIDQ + JQS_wTail); + size = readw(mpu->dev->MIDQ + JQS_wSize); + if (head > size || tail > size) + goto out; + while (head != tail) { + unsigned char val = readw(pwMIDQData + 2 * head); - if (test_bit(MSNDMIDI_MODE_BIT_INPUT_TRIGGER, - &mpu->mode)) - snd_rawmidi_receive(mpu->substream_input, - (unsigned char *)&val, 1); - - wTmp = readw(mpu->dev->MIDQ + JQS_wHead) + 1; - if (wTmp > readw(mpu->dev->MIDQ + JQS_wSize)) - writew(0, mpu->dev->MIDQ + JQS_wHead); - else - writew(wTmp, mpu->dev->MIDQ + JQS_wHead); + if (test_bit(MSNDMIDI_MODE_BIT_INPUT_TRIGGER, &mpu->mode)) + snd_rawmidi_receive(mpu->substream_input, &val, 1); + if (++head > size) + head = 0; + writew(head, mpu->dev->MIDQ + JQS_wHead); } + out: spin_unlock_irqrestore(&mpu->input_lock, flags); } EXPORT_SYMBOL(snd_msndmidi_input_read); diff --git a/sound/isa/msnd/msnd_pinnacle.c b/sound/isa/msnd/msnd_pinnacle.c index 4c072666115d..a31ea6c22d19 100644 --- a/sound/isa/msnd/msnd_pinnacle.c +++ b/sound/isa/msnd/msnd_pinnacle.c @@ -170,23 +170,24 @@ static irqreturn_t snd_msnd_interrupt(int irq, void *dev_id) { struct snd_msnd *chip = dev_id; void *pwDSPQData = chip->mappedbase + DSPQ_DATA_BUFF; + u16 head, tail, size; /* Send ack to DSP */ /* inb(chip->io + HP_RXL); */ /* Evaluate queued DSP messages */ - while (readw(chip->DSPQ + JQS_wTail) != readw(chip->DSPQ + JQS_wHead)) { - u16 wTmp; - - snd_msnd_eval_dsp_msg(chip, - readw(pwDSPQData + 2 * readw(chip->DSPQ + JQS_wHead))); - - wTmp = readw(chip->DSPQ + JQS_wHead) + 1; - if (wTmp > readw(chip->DSPQ + JQS_wSize)) - writew(0, chip->DSPQ + JQS_wHead); - else - writew(wTmp, chip->DSPQ + JQS_wHead); + head = readw(chip->DSPQ + JQS_wHead); + tail = readw(chip->DSPQ + JQS_wTail); + size = readw(chip->DSPQ + JQS_wSize); + if (head > size || tail > size) + goto out; + while (head != tail) { + snd_msnd_eval_dsp_msg(chip, readw(pwDSPQData + 2 * head)); + if (++head > size) + head = 0; + writew(head, chip->DSPQ + JQS_wHead); } + out: /* Send ack to DSP */ inb(chip->io + HP_RXL); return IRQ_HANDLED; From f7ec367c8ea7021517c9c04b0022c225d2d0785a Mon Sep 17 00:00:00 2001 From: Ben Seri Date: Sat, 9 Sep 2017 23:15:59 +0200 Subject: [PATCH 139/546] Bluetooth: Properly check L2CAP config option output buffer length commit e860d2c904d1a9f38a24eb44c9f34b8f915a6ea3 upstream. Validate the output buffer length for L2CAP config requests and responses to avoid overflowing the stack buffer used for building the option blocks. Signed-off-by: Ben Seri Signed-off-by: Marcel Holtmann Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- net/bluetooth/l2cap_core.c | 80 ++++++++++++++++++++------------------ 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c index 66e8b6ee19a5..357bcd34cf1f 100644 --- a/net/bluetooth/l2cap_core.c +++ b/net/bluetooth/l2cap_core.c @@ -57,7 +57,7 @@ static struct sk_buff *l2cap_build_cmd(struct l2cap_conn *conn, u8 code, u8 ident, u16 dlen, void *data); static void l2cap_send_cmd(struct l2cap_conn *conn, u8 ident, u8 code, u16 len, void *data); -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data); +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size); static void l2cap_send_disconn_req(struct l2cap_chan *chan, int err); static void l2cap_tx(struct l2cap_chan *chan, struct l2cap_ctrl *control, @@ -1462,7 +1462,7 @@ static void l2cap_conn_start(struct l2cap_conn *conn) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -2966,12 +2966,15 @@ static inline int l2cap_get_conf_opt(void **ptr, int *type, int *olen, return len; } -static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) +static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val, size_t size) { struct l2cap_conf_opt *opt = *ptr; BT_DBG("type 0x%2.2x len %u val 0x%lx", type, len, val); + if (size < L2CAP_CONF_OPT_SIZE + len) + return; + opt->type = type; opt->len = len; @@ -2996,7 +2999,7 @@ static void l2cap_add_conf_opt(void **ptr, u8 type, u8 len, unsigned long val) *ptr += L2CAP_CONF_OPT_SIZE + len; } -static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) +static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan, size_t size) { struct l2cap_conf_efs efs; @@ -3024,7 +3027,7 @@ static void l2cap_add_opt_efs(void **ptr, struct l2cap_chan *chan) } l2cap_add_conf_opt(ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, size); } static void l2cap_ack_timeout(struct work_struct *work) @@ -3170,11 +3173,12 @@ static inline void l2cap_txwin_setup(struct l2cap_chan *chan) chan->ack_win = chan->tx_win; } -static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_req *req = data; struct l2cap_conf_rfc rfc = { .mode = chan->mode }; void *ptr = req->data; + void *endptr = data + data_size; u16 size; BT_DBG("chan %p", chan); @@ -3199,7 +3203,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) done: if (chan->imtu != L2CAP_DEFAULT_MTU) - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); switch (chan->mode) { case L2CAP_MODE_BASIC: @@ -3218,7 +3222,7 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) rfc.max_pdu_size = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; case L2CAP_MODE_ERTM: @@ -3238,21 +3242,21 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) L2CAP_DEFAULT_TX_WINDOW); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (test_bit(FLAG_EXT_CTRL, &chan->flags)) l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; @@ -3270,17 +3274,17 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) rfc.max_pdu_size = cpu_to_le16(size); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) - l2cap_add_opt_efs(&ptr, chan); + l2cap_add_opt_efs(&ptr, chan, endptr - ptr); if (chan->conn->feat_mask & L2CAP_FEAT_FCS) if (chan->fcs == L2CAP_FCS_NONE || test_bit(CONF_RECV_NO_FCS, &chan->conf_state)) { chan->fcs = L2CAP_FCS_NONE; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FCS, 1, - chan->fcs); + chan->fcs, endptr - ptr); } break; } @@ -3291,10 +3295,11 @@ static int l2cap_build_conf_req(struct l2cap_chan *chan, void *data) return ptr - data; } -static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) +static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data, size_t data_size) { struct l2cap_conf_rsp *rsp = data; void *ptr = rsp->data; + void *endptr = data + data_size; void *req = chan->conf_req; int len = chan->conf_len; int type, hint, olen; @@ -3396,7 +3401,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); } if (result == L2CAP_CONF_SUCCESS) { @@ -3409,7 +3414,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) chan->omtu = mtu; set_bit(CONF_MTU_DONE, &chan->conf_state); } - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->omtu, endptr - ptr); if (remote_efs) { if (chan->local_stype != L2CAP_SERV_NOTRAFIC && @@ -3423,7 +3428,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } else { /* Send PENDING Conf Rsp */ result = L2CAP_CONF_PENDING; @@ -3456,7 +3461,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); if (test_bit(FLAG_EFS_ENABLE, &chan->flags)) { chan->remote_id = efs.id; @@ -3470,7 +3475,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) le32_to_cpu(efs.sdu_itime); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); } break; @@ -3484,7 +3489,7 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) set_bit(CONF_MODE_DONE, &chan->conf_state); l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, sizeof(rfc), - (unsigned long) &rfc); + (unsigned long) &rfc, endptr - ptr); break; @@ -3506,10 +3511,11 @@ static int l2cap_parse_conf_req(struct l2cap_chan *chan, void *data) } static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, - void *data, u16 *result) + void *data, size_t size, u16 *result) { struct l2cap_conf_req *req = data; void *ptr = req->data; + void *endptr = data + size; int type, olen; unsigned long val; struct l2cap_conf_rfc rfc = { .mode = L2CAP_MODE_BASIC }; @@ -3527,13 +3533,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->imtu = L2CAP_DEFAULT_MIN_MTU; } else chan->imtu = val; - l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu); + l2cap_add_conf_opt(&ptr, L2CAP_CONF_MTU, 2, chan->imtu, endptr - ptr); break; case L2CAP_CONF_FLUSH_TO: chan->flush_to = val; l2cap_add_conf_opt(&ptr, L2CAP_CONF_FLUSH_TO, - 2, chan->flush_to); + 2, chan->flush_to, endptr - ptr); break; case L2CAP_CONF_RFC: @@ -3547,13 +3553,13 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, chan->fcs = 0; l2cap_add_conf_opt(&ptr, L2CAP_CONF_RFC, - sizeof(rfc), (unsigned long) &rfc); + sizeof(rfc), (unsigned long) &rfc, endptr - ptr); break; case L2CAP_CONF_EWS: chan->ack_win = min_t(u16, val, chan->ack_win); l2cap_add_conf_opt(&ptr, L2CAP_CONF_EWS, 2, - chan->tx_win); + chan->tx_win, endptr - ptr); break; case L2CAP_CONF_EFS: @@ -3566,7 +3572,7 @@ static int l2cap_parse_conf_rsp(struct l2cap_chan *chan, void *rsp, int len, return -ECONNREFUSED; l2cap_add_conf_opt(&ptr, L2CAP_CONF_EFS, sizeof(efs), - (unsigned long) &efs); + (unsigned long) &efs, endptr - ptr); break; case L2CAP_CONF_FCS: @@ -3671,7 +3677,7 @@ void __l2cap_connect_rsp_defer(struct l2cap_chan *chan) return; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3879,7 +3885,7 @@ static struct l2cap_chan *l2cap_connect(struct l2cap_conn *conn, u8 buf[128]; set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -3957,7 +3963,7 @@ static int l2cap_connect_create_rsp(struct l2cap_conn *conn, break; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, req), req); + l2cap_build_conf_req(chan, req, sizeof(req)), req); chan->num_conf_req++; break; @@ -4069,7 +4075,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, } /* Complete config. */ - len = l2cap_parse_conf_req(chan, rsp); + len = l2cap_parse_conf_req(chan, rsp, sizeof(rsp)); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto unlock; @@ -4103,7 +4109,7 @@ static inline int l2cap_config_req(struct l2cap_conn *conn, if (!test_and_set_bit(CONF_REQ_SENT, &chan->conf_state)) { u8 buf[64]; l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } @@ -4163,7 +4169,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, char buf[64]; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - buf, &result); + buf, sizeof(buf), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4193,7 +4199,7 @@ static inline int l2cap_config_rsp(struct l2cap_conn *conn, /* throw out any old stored conf requests */ result = L2CAP_CONF_SUCCESS; len = l2cap_parse_conf_rsp(chan, rsp->data, len, - req, &result); + req, sizeof(req), &result); if (len < 0) { l2cap_send_disconn_req(chan, ECONNRESET); goto done; @@ -4770,7 +4776,7 @@ static void l2cap_do_create(struct l2cap_chan *chan, int result, set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(chan->conn, l2cap_get_ident(chan->conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), buf); + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } } @@ -7442,7 +7448,7 @@ static void l2cap_security_cfm(struct hci_conn *hcon, u8 status, u8 encrypt) set_bit(CONF_REQ_SENT, &chan->conf_state); l2cap_send_cmd(conn, l2cap_get_ident(conn), L2CAP_CONF_REQ, - l2cap_build_conf_req(chan, buf), + l2cap_build_conf_req(chan, buf, sizeof(buf)), buf); chan->num_conf_req++; } From 84478477d0b8572c6d267492aaaf49acd6fc4db5 Mon Sep 17 00:00:00 2001 From: Mark Rutland Date: Tue, 22 Aug 2017 11:36:17 +0100 Subject: [PATCH 140/546] ARM: 8692/1: mm: abort uaccess retries upon fatal signal commit 746a272e44141af24a02f6c9b0f65f4c4598ed42 upstream. When there's a fatal signal pending, arm's do_page_fault() implementation returns 0. The intent is that we'll return to the faulting userspace instruction, delivering the signal on the way. However, if we take a fatal signal during fixing up a uaccess, this results in a return to the faulting kernel instruction, which will be instantly retried, resulting in the same fault being taken forever. As the task never reaches userspace, the signal is not delivered, and the task is left unkillable. While the task is stuck in this state, it can inhibit the forward progress of the system. To avoid this, we must ensure that when a fatal signal is pending, we apply any necessary fixup for a faulting kernel instruction. Thus we will return to an error path, and it is up to that code to make forward progress towards delivering the fatal signal. Signed-off-by: Mark Rutland Reviewed-by: Steve Capper Signed-off-by: Russell King Signed-off-by: Greg Kroah-Hartman --- arch/arm/mm/fault.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index c095455d496e..0d20cd594017 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -314,8 +314,11 @@ do_page_fault(unsigned long addr, unsigned int fsr, struct pt_regs *regs) * signal first. We do not need to release the mmap_sem because * it would already be released in __lock_page_or_retry in * mm/filemap.c. */ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (!user_mode(regs)) + goto no_context; return 0; + } /* * Major/minor page fault accounting is only done on the From 677a803640497d489406ba708608aa9de7755bb0 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Sep 2017 21:28:11 -0400 Subject: [PATCH 141/546] NFS: Fix 2 use after free issues in the I/O code commit 196639ebbe63a037fe9a80669140bd292d8bcd80 upstream. The writeback code wants to send a commit after processing the pages, which is why we want to delay releasing the struct path until after that's done. Also, the layout code expects that we do not free the inode before we've put the layout segments in pnfs_writehdr_free() and pnfs_readhdr_free() Fixes: 919e3bd9a875 ("NFS: Ensure we commit after writeback is complete") Fixes: 4714fb51fd03 ("nfs: remove pgio_header refcount, related cleanup") Signed-off-by: Trond Myklebust Signed-off-by: Greg Kroah-Hartman --- fs/nfs/internal.h | 1 - fs/nfs/pagelist.c | 26 ++++++++++++-------------- fs/nfs/pnfs.c | 2 -- 3 files changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 9dea85f7f918..578350fd96e1 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -243,7 +243,6 @@ int nfs_iocounter_wait(struct nfs_io_counter *c); extern const struct nfs_pageio_ops nfs_pgio_rw_ops; struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *); void nfs_pgio_header_free(struct nfs_pgio_header *); -void nfs_pgio_data_destroy(struct nfs_pgio_header *); int nfs_generic_pgio(struct nfs_pageio_descriptor *, struct nfs_pgio_header *); int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr, struct rpc_cred *cred, const struct nfs_rpc_ops *rpc_ops, diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c index 452a011ba0d8..8ebfdd00044b 100644 --- a/fs/nfs/pagelist.c +++ b/fs/nfs/pagelist.c @@ -528,16 +528,6 @@ struct nfs_pgio_header *nfs_pgio_header_alloc(const struct nfs_rw_ops *ops) } EXPORT_SYMBOL_GPL(nfs_pgio_header_alloc); -/* - * nfs_pgio_header_free - Free a read or write header - * @hdr: The header to free - */ -void nfs_pgio_header_free(struct nfs_pgio_header *hdr) -{ - hdr->rw_ops->rw_free_header(hdr); -} -EXPORT_SYMBOL_GPL(nfs_pgio_header_free); - /** * nfs_pgio_data_destroy - make @hdr suitable for reuse * @@ -546,14 +536,24 @@ EXPORT_SYMBOL_GPL(nfs_pgio_header_free); * * @hdr: A header that has had nfs_generic_pgio called */ -void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) +static void nfs_pgio_data_destroy(struct nfs_pgio_header *hdr) { if (hdr->args.context) put_nfs_open_context(hdr->args.context); if (hdr->page_array.pagevec != hdr->page_array.page_array) kfree(hdr->page_array.pagevec); } -EXPORT_SYMBOL_GPL(nfs_pgio_data_destroy); + +/* + * nfs_pgio_header_free - Free a read or write header + * @hdr: The header to free + */ +void nfs_pgio_header_free(struct nfs_pgio_header *hdr) +{ + nfs_pgio_data_destroy(hdr); + hdr->rw_ops->rw_free_header(hdr); +} +EXPORT_SYMBOL_GPL(nfs_pgio_header_free); /** * nfs_pgio_rpcsetup - Set up arguments for a pageio call @@ -671,7 +671,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, u32 midx; set_bit(NFS_IOHDR_REDO, &hdr->flags); - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); /* TODO: Make sure it's right to clean up all mirrors here * and not just hdr->pgio_mirror_idx */ @@ -689,7 +688,6 @@ static int nfs_pgio_error(struct nfs_pageio_descriptor *desc, static void nfs_pgio_release(void *calldata) { struct nfs_pgio_header *hdr = calldata; - nfs_pgio_data_destroy(hdr); hdr->completion_ops->completion(hdr); } diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c index 3cae0726c1b1..7af7bedd7c02 100644 --- a/fs/nfs/pnfs.c +++ b/fs/nfs/pnfs.c @@ -1943,7 +1943,6 @@ pnfs_write_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_write_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } @@ -2059,7 +2058,6 @@ pnfs_read_through_mds(struct nfs_pageio_descriptor *desc, nfs_pageio_reset_read_mds(desc); mirror->pg_recoalesce = 1; } - nfs_pgio_data_destroy(hdr); hdr->release(hdr); } From ad3903434142953a03b84ec8719ce80373a62266 Mon Sep 17 00:00:00 2001 From: Richard Wareing Date: Wed, 13 Sep 2017 09:09:35 +1000 Subject: [PATCH 142/546] xfs: XFS_IS_REALTIME_INODE() should be false if no rt device present commit b31ff3cdf540110da4572e3e29bd172087af65cc upstream. If using a kernel with CONFIG_XFS_RT=y and we set the RHINHERIT flag on a directory in a filesystem that does not have a realtime device and create a new file in that directory, it gets marked as a real time file. When data is written and a fsync is issued, the filesystem attempts to flush a non-existent rt device during the fsync process. This results in a crash dereferencing a null buftarg pointer in xfs_blkdev_issue_flush(): BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 IP: xfs_blkdev_issue_flush+0xd/0x20 ..... Call Trace: xfs_file_fsync+0x188/0x1c0 vfs_fsync_range+0x3b/0xa0 do_fsync+0x3d/0x70 SyS_fsync+0x10/0x20 do_syscall_64+0x4d/0xb0 entry_SYSCALL64_slow_path+0x25/0x25 Setting RT inode flags does not require special privileges so any unprivileged user can cause this oops to occur. To reproduce, confirm kernel is compiled with CONFIG_XFS_RT=y and run: # mkfs.xfs -f /dev/pmem0 # mount /dev/pmem0 /mnt/test # mkdir /mnt/test/foo # xfs_io -c 'chattr +t' /mnt/test/foo # xfs_io -f -c 'pwrite 0 5m' -c fsync /mnt/test/foo/bar Or just run xfstests with MKFS_OPTIONS="-d rtinherit=1" and wait. Kernels built with CONFIG_XFS_RT=n are not exposed to this bug. Fixes: f538d4da8d52 ("[XFS] write barrier support") Signed-off-by: Richard Wareing Signed-off-by: Dave Chinner Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/xfs/xfs_linux.h | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/fs/xfs/xfs_linux.h b/fs/xfs/xfs_linux.h index ec0e239a0fa9..201aae0b2662 100644 --- a/fs/xfs/xfs_linux.h +++ b/fs/xfs/xfs_linux.h @@ -369,7 +369,14 @@ static inline __uint64_t howmany_64(__uint64_t x, __uint32_t y) #endif /* DEBUG */ #ifdef CONFIG_XFS_RT -#define XFS_IS_REALTIME_INODE(ip) ((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) + +/* + * make sure we ignore the inode flag if the filesystem doesn't have a + * configured realtime device. + */ +#define XFS_IS_REALTIME_INODE(ip) \ + (((ip)->i_d.di_flags & XFS_DIFLAG_REALTIME) && \ + (ip)->i_mount->m_rtdev_targp) #else #define XFS_IS_REALTIME_INODE(ip) (0) #endif From b52c9082f2eb3a6f7fbbc86fad3eaa2a1725da66 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 13 Sep 2017 14:10:05 -0700 Subject: [PATCH 143/546] Linux 4.4.88 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index f6838187b568..788d90a0051b 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 87 +SUBLEVEL = 88 EXTRAVERSION = NAME = Blurry Fish Butt From 83ff4b01ca2c622644a06857bc645fdb72393d0c Mon Sep 17 00:00:00 2001 From: Yang Jin Date: Wed, 26 Jul 2017 12:52:22 -0700 Subject: [PATCH 144/546] uid_sys_stats: log task io with a debug flag Add a hashmap inside each uid_entry to keep track of task name and io. Task full name is a combination of thread and process name. Bug: 63739275 Change-Id: I30083b757eaef8c61e55a213a883ce8d0c9cf2b1 Signed-off-by: Yang Jin --- drivers/misc/Kconfig | 7 + drivers/misc/uid_sys_stats.c | 303 ++++++++++++++++++++++++++++++----- 2 files changed, 266 insertions(+), 44 deletions(-) diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig index 9e649992c177..88056d1e8feb 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -533,6 +533,13 @@ config UID_SYS_STATS Per UID based io statistics exported to /proc/uid_io Per UID based procstat control in /proc/uid_procstat +config UID_SYS_STATS_DEBUG + bool "Per-TASK statistics" + depends on UID_SYS_STATS + default n + help + Per TASK based io statistics exported to /proc/uid_io + config MEMORY_STATE_TIME tristate "Memory freq/bandwidth time statistics" depends on PROFILING diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 3c9d311106cd..2e4364b96b9a 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -52,6 +52,15 @@ struct io_stats { #define UID_STATE_DEAD_TASKS 4 #define UID_STATE_SIZE 5 +#define MAX_TASK_COMM_LEN 256 + +struct task_entry { + char comm[MAX_TASK_COMM_LEN]; + pid_t pid; + struct io_stats io[UID_STATE_SIZE]; + struct hlist_node hash; +}; + struct uid_entry { uid_t uid; cputime_t utime; @@ -61,8 +70,231 @@ struct uid_entry { int state; struct io_stats io[UID_STATE_SIZE]; struct hlist_node hash; +#ifdef CONFIG_UID_SYS_STATS_DEBUG + DECLARE_HASHTABLE(task_entries, UID_HASH_BITS); +#endif }; +static u64 compute_write_bytes(struct task_struct *task) +{ + if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) + return 0; + + return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; +} + +static void compute_io_bucket_stats(struct io_stats *io_bucket, + struct io_stats *io_curr, + struct io_stats *io_last, + struct io_stats *io_dead) +{ + /* tasks could switch to another uid group, but its io_last in the + * previous uid group could still be positive. + * therefore before each update, do an overflow check first + */ + int64_t delta; + + delta = io_curr->read_bytes + io_dead->read_bytes - + io_last->read_bytes; + io_bucket->read_bytes += delta > 0 ? delta : 0; + delta = io_curr->write_bytes + io_dead->write_bytes - + io_last->write_bytes; + io_bucket->write_bytes += delta > 0 ? delta : 0; + delta = io_curr->rchar + io_dead->rchar - io_last->rchar; + io_bucket->rchar += delta > 0 ? delta : 0; + delta = io_curr->wchar + io_dead->wchar - io_last->wchar; + io_bucket->wchar += delta > 0 ? delta : 0; + delta = io_curr->fsync + io_dead->fsync - io_last->fsync; + io_bucket->fsync += delta > 0 ? delta : 0; + + io_last->read_bytes = io_curr->read_bytes; + io_last->write_bytes = io_curr->write_bytes; + io_last->rchar = io_curr->rchar; + io_last->wchar = io_curr->wchar; + io_last->fsync = io_curr->fsync; + + memset(io_dead, 0, sizeof(struct io_stats)); +} + +#ifdef CONFIG_UID_SYS_STATS_DEBUG +static void get_full_task_comm(struct task_entry *task_entry, + struct task_struct *task) +{ + int i = 0, offset = 0, len = 0; + /* save one byte for terminating null character */ + int unused_len = MAX_TASK_COMM_LEN - TASK_COMM_LEN - 1; + char buf[unused_len]; + struct mm_struct *mm = task->mm; + + /* fill the first TASK_COMM_LEN bytes with thread name */ + get_task_comm(task_entry->comm, task); + i = strlen(task_entry->comm); + while (i < TASK_COMM_LEN) + task_entry->comm[i++] = ' '; + + /* next the executable file name */ + if (mm) { + down_read(&mm->mmap_sem); + if (mm->exe_file) { + char *pathname = d_path(&mm->exe_file->f_path, buf, + unused_len); + + if (!IS_ERR(pathname)) { + len = strlcpy(task_entry->comm + i, pathname, + unused_len); + i += len; + task_entry->comm[i++] = ' '; + unused_len--; + } + } + up_read(&mm->mmap_sem); + } + unused_len -= len; + + /* fill the rest with command line argument + * replace each null or new line character + * between args in argv with whitespace */ + len = get_cmdline(task, buf, unused_len); + while (offset < len) { + if (buf[offset] != '\0' && buf[offset] != '\n') + task_entry->comm[i++] = buf[offset]; + else + task_entry->comm[i++] = ' '; + offset++; + } + + /* get rid of trailing whitespaces in case when arg is memset to + * zero before being reset in userspace + */ + while (task_entry->comm[i-1] == ' ') + i--; + task_entry->comm[i] = '\0'; +} + +static struct task_entry *find_task_entry(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + + hash_for_each_possible(uid_entry->task_entries, task_entry, hash, + task->pid) { + if (task->pid == task_entry->pid) { + /* if thread name changed, update the entire command */ + int len = strnchr(task_entry->comm, ' ', TASK_COMM_LEN) + - task_entry->comm; + + if (strncmp(task_entry->comm, task->comm, len)) + get_full_task_comm(task_entry, task); + return task_entry; + } + } + return NULL; +} + +static struct task_entry *find_or_register_task(struct uid_entry *uid_entry, + struct task_struct *task) +{ + struct task_entry *task_entry; + pid_t pid = task->pid; + + task_entry = find_task_entry(uid_entry, task); + if (task_entry) + return task_entry; + + task_entry = kzalloc(sizeof(struct task_entry), GFP_ATOMIC); + if (!task_entry) + return NULL; + + get_full_task_comm(task_entry, task); + + task_entry->pid = pid; + hash_add(uid_entry->task_entries, &task_entry->hash, (unsigned int)pid); + + return task_entry; +} + +static void remove_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + struct hlist_node *tmp_task; + + hash_for_each_safe(uid_entry->task_entries, bkt_task, + tmp_task, task_entry, hash) { + hash_del(&task_entry->hash); + kfree(task_entry); + } +} + +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + memset(&task_entry->io[UID_STATE_TOTAL_CURR], 0, + sizeof(struct io_stats)); + } +} + +static void add_uid_tasks_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) +{ + struct task_entry *task_entry = find_or_register_task(uid_entry, task); + struct io_stats *task_io_slot = &task_entry->io[slot]; + + task_io_slot->read_bytes += task->ioac.read_bytes; + task_io_slot->write_bytes += compute_write_bytes(task); + task_io_slot->rchar += task->ioac.rchar; + task_io_slot->wchar += task->ioac.wchar; + task_io_slot->fsync += task->ioac.syscfs; +} + +static void compute_io_uid_tasks(struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + compute_io_bucket_stats(&task_entry->io[uid_entry->state], + &task_entry->io[UID_STATE_TOTAL_CURR], + &task_entry->io[UID_STATE_TOTAL_LAST], + &task_entry->io[UID_STATE_DEAD_TASKS]); + } +} + +static void show_io_uid_tasks(struct seq_file *m, struct uid_entry *uid_entry) +{ + struct task_entry *task_entry; + unsigned long bkt_task; + + hash_for_each(uid_entry->task_entries, bkt_task, task_entry, hash) { + /* Separated by comma because space exists in task comm */ + seq_printf(m, "task,%s,%lu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu,%llu\n", + task_entry->comm, + (unsigned long)task_entry->pid, + task_entry->io[UID_STATE_FOREGROUND].rchar, + task_entry->io[UID_STATE_FOREGROUND].wchar, + task_entry->io[UID_STATE_FOREGROUND].read_bytes, + task_entry->io[UID_STATE_FOREGROUND].write_bytes, + task_entry->io[UID_STATE_BACKGROUND].rchar, + task_entry->io[UID_STATE_BACKGROUND].wchar, + task_entry->io[UID_STATE_BACKGROUND].read_bytes, + task_entry->io[UID_STATE_BACKGROUND].write_bytes, + task_entry->io[UID_STATE_FOREGROUND].fsync, + task_entry->io[UID_STATE_BACKGROUND].fsync); + } +} +#else +static void remove_uid_tasks(struct uid_entry *uid_entry) {}; +static void set_io_uid_tasks_zero(struct uid_entry *uid_entry) {}; +static void add_uid_tasks_io_stats(struct uid_entry *uid_entry, + struct task_struct *task, int slot) {}; +static void compute_io_uid_tasks(struct uid_entry *uid_entry) {}; +static void show_io_uid_tasks(struct seq_file *m, + struct uid_entry *uid_entry) {} +#endif + static struct uid_entry *find_uid_entry(uid_t uid) { struct uid_entry *uid_entry; @@ -86,7 +318,9 @@ static struct uid_entry *find_or_register_uid(uid_t uid) return NULL; uid_entry->uid = uid; - +#ifdef CONFIG_UID_SYS_STATS_DEBUG + hash_init(uid_entry->task_entries); +#endif hash_add(hash_table, &uid_entry->hash, uid); return uid_entry; @@ -192,6 +426,7 @@ static ssize_t uid_remove_write(struct file *file, hash_for_each_possible_safe(hash_table, uid_entry, tmp, hash, (uid_t)uid_start) { if (uid_start == uid_entry->uid) { + remove_uid_tasks(uid_entry); hash_del(&uid_entry->hash); kfree(uid_entry); } @@ -208,13 +443,6 @@ static const struct file_operations uid_remove_fops = { .write = uid_remove_write, }; -static u64 compute_write_bytes(struct task_struct *task) -{ - if (task->ioac.write_bytes <= task->ioac.cancelled_write_bytes) - return 0; - - return task->ioac.write_bytes - task->ioac.cancelled_write_bytes; -} static void add_uid_io_stats(struct uid_entry *uid_entry, struct task_struct *task, int slot) @@ -226,28 +454,8 @@ static void add_uid_io_stats(struct uid_entry *uid_entry, io_slot->rchar += task->ioac.rchar; io_slot->wchar += task->ioac.wchar; io_slot->fsync += task->ioac.syscfs; -} -static void compute_uid_io_bucket_stats(struct io_stats *io_bucket, - struct io_stats *io_curr, - struct io_stats *io_last, - struct io_stats *io_dead) -{ - io_bucket->read_bytes += io_curr->read_bytes + io_dead->read_bytes - - io_last->read_bytes; - io_bucket->write_bytes += io_curr->write_bytes + io_dead->write_bytes - - io_last->write_bytes; - io_bucket->rchar += io_curr->rchar + io_dead->rchar - io_last->rchar; - io_bucket->wchar += io_curr->wchar + io_dead->wchar - io_last->wchar; - io_bucket->fsync += io_curr->fsync + io_dead->fsync - io_last->fsync; - - io_last->read_bytes = io_curr->read_bytes; - io_last->write_bytes = io_curr->write_bytes; - io_last->rchar = io_curr->rchar; - io_last->wchar = io_curr->wchar; - io_last->fsync = io_curr->fsync; - - memset(io_dead, 0, sizeof(struct io_stats)); + add_uid_tasks_io_stats(uid_entry, task, slot); } static void update_io_stats_all_locked(void) @@ -258,9 +466,11 @@ static void update_io_stats_all_locked(void) unsigned long bkt; uid_t uid; - hash_for_each(hash_table, bkt, uid_entry, hash) + hash_for_each(hash_table, bkt, uid_entry, hash) { memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); + } rcu_read_lock(); do_each_thread(temp, task) { @@ -274,10 +484,11 @@ static void update_io_stats_all_locked(void) rcu_read_unlock(); hash_for_each(hash_table, bkt, uid_entry, hash) { - compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + compute_io_bucket_stats(&uid_entry->io[uid_entry->state], &uid_entry->io[UID_STATE_TOTAL_CURR], &uid_entry->io[UID_STATE_TOTAL_LAST], &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); } } @@ -288,6 +499,7 @@ static void update_io_stats_uid_locked(struct uid_entry *uid_entry) memset(&uid_entry->io[UID_STATE_TOTAL_CURR], 0, sizeof(struct io_stats)); + set_io_uid_tasks_zero(uid_entry); rcu_read_lock(); do_each_thread(temp, task) { @@ -297,12 +509,14 @@ static void update_io_stats_uid_locked(struct uid_entry *uid_entry) } while_each_thread(temp, task); rcu_read_unlock(); - compute_uid_io_bucket_stats(&uid_entry->io[uid_entry->state], + compute_io_bucket_stats(&uid_entry->io[uid_entry->state], &uid_entry->io[UID_STATE_TOTAL_CURR], &uid_entry->io[UID_STATE_TOTAL_LAST], &uid_entry->io[UID_STATE_DEAD_TASKS]); + compute_io_uid_tasks(uid_entry); } + static int uid_io_show(struct seq_file *m, void *v) { struct uid_entry *uid_entry; @@ -314,21 +528,22 @@ static int uid_io_show(struct seq_file *m, void *v) hash_for_each(hash_table, bkt, uid_entry, hash) { seq_printf(m, "%d %llu %llu %llu %llu %llu %llu %llu %llu %llu %llu\n", - uid_entry->uid, - uid_entry->io[UID_STATE_FOREGROUND].rchar, - uid_entry->io[UID_STATE_FOREGROUND].wchar, - uid_entry->io[UID_STATE_FOREGROUND].read_bytes, - uid_entry->io[UID_STATE_FOREGROUND].write_bytes, - uid_entry->io[UID_STATE_BACKGROUND].rchar, - uid_entry->io[UID_STATE_BACKGROUND].wchar, - uid_entry->io[UID_STATE_BACKGROUND].read_bytes, - uid_entry->io[UID_STATE_BACKGROUND].write_bytes, - uid_entry->io[UID_STATE_FOREGROUND].fsync, - uid_entry->io[UID_STATE_BACKGROUND].fsync); + uid_entry->uid, + uid_entry->io[UID_STATE_FOREGROUND].rchar, + uid_entry->io[UID_STATE_FOREGROUND].wchar, + uid_entry->io[UID_STATE_FOREGROUND].read_bytes, + uid_entry->io[UID_STATE_FOREGROUND].write_bytes, + uid_entry->io[UID_STATE_BACKGROUND].rchar, + uid_entry->io[UID_STATE_BACKGROUND].wchar, + uid_entry->io[UID_STATE_BACKGROUND].read_bytes, + uid_entry->io[UID_STATE_BACKGROUND].write_bytes, + uid_entry->io[UID_STATE_FOREGROUND].fsync, + uid_entry->io[UID_STATE_BACKGROUND].fsync); + + show_io_uid_tasks(m, uid_entry); } rt_mutex_unlock(&uid_lock); - return 0; } From e86c44165defb84e396ec1ae589096540bff3244 Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Thu, 17 Aug 2017 18:52:04 +0530 Subject: [PATCH 145/546] ANDROID: uid_sys_stats: Fix implicit declaration of get_cmdline() Include linux/mm.h for get_cmdline() declaration. Change-Id: Icad6ef7deef4d93d92d423c96bfa61fb5d66d0b7 Fixes: Change-Id: I30083b757eaef8c61e55a213a883ce8d0c9cf2b1 ("uid_sys_stats: log task io with a debug flag") Signed-off-by: Amit Pundir --- drivers/misc/uid_sys_stats.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/misc/uid_sys_stats.c b/drivers/misc/uid_sys_stats.c index 2e4364b96b9a..031320e51522 100644 --- a/drivers/misc/uid_sys_stats.c +++ b/drivers/misc/uid_sys_stats.c @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include From 0ee4eb5c9aa3821e5ca12c311236364b6e30f8cd Mon Sep 17 00:00:00 2001 From: Danilo Krummrich Date: Mon, 14 Aug 2017 12:45:38 +0200 Subject: [PATCH 146/546] ANDROID: usb: gadget: configfs: fix null ptr in android_disconnect There's a race between usb_gadget_udc_stop() which is likely to set the gadget driver to NULL in the udc driver and this drivers gadget disconnect fn which likely checks for the gadget driver to a null ptr. It happens that unbind (doing set_gadget_data(NULL)) is called before the gadget driver is set to NULL and the udc driver calls disconnect fn which results in cdev being a null ptr. As a workaround we check cdev in android_disconnect() to prevent the following panic: Unable to handle kernel NULL pointer dereference at virtual address 000000a8 pgd = ffffff800940a000 [000000a8] *pgd=00000000be1fe003, *pud=00000000be1fe003, *pmd=0000000000000000 Internal error: Oops: 96000046 [#1] PREEMPT SMP CPU: 7 PID: 1134 Comm: kworker/u16:3 Tainted: G S 4.9.41-g75cd2a0231ea-dirty #4 Hardware name: HiKey960 (DT) Workqueue: events_power_efficient event_work task: ffffffc0b5f4f000 task.stack: ffffffc0b5b94000 PC is at android_disconnect+0x54/0xa4 LR is at android_disconnect+0x54/0xa4 pc : [] lr : [] pstate: 80000185 sp : ffffffc0b5b97bf0 x29: ffffffc0b5b97bf0 x28: 0000000000000003 x27: ffffffc0b5181c54 x26: ffffffc0b5181c68 x25: ffffff8008dc1000 x24: ffffffc0b5181d70 x23: ffffff8008dc18a0 x22: ffffffc0b5f5a018 x21: ffffffc0b5894ad8 x20: 0000000000000000 x19: ffffff8008ddaec8 x18: 0000000000000000 x17: 0000000000000000 x16: 0000000000000000 x15: 0000000000000000 x14: 00000000007c9ccd x13: 0000000000000000 x12: 0000000000000000 x11: 0000000000000001 x10: 0000000000000001 x9 : ffffff800930f1a8 x8 : ffffff800932a133 x7 : 0000000000000000 x6 : 0000000000000000 x5 : ffffffc0b5b97a50 x4 : ffffffc0be19f090 x3 : 0000000000000000 x2 : ffffff80091ca000 x1 : 000000000000002f x0 : 000000000000002f This happened on a hikey960 with the following backtrace: [] android_disconnect+0x54/0xa4 [] dwc3_disconnect_gadget.part.19+0x114.888119] [] dwc3_gadget_suspend+0x6c/0x70 [] dwc3_suspend_device+0x58/0xa0 [] dwc3_otg_work+0x214/0x474 [] event_work+0x3bc/0x5ac [] process_one_work+0x14c/0x43c [] worker_thread+0x5c/0x438 [] kthread+0xec/0x100 [] ret_from_fork+0x10/0x50 dwc3_otg_work tries to handle a switch from host to device mode and therefore calls disconnect on the gadget driver. To reproduce the issue it is enaugh to enable tethering (rndis gadget), unplug and plug in again the usb connector which causes the change from device to host and back to device mode. Signed-off-by: Danilo Krummrich --- drivers/usb/gadget/configfs.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/drivers/usb/gadget/configfs.c b/drivers/usb/gadget/configfs.c index 54849fe9cb01..a0bd3c542d06 100644 --- a/drivers/usb/gadget/configfs.c +++ b/drivers/usb/gadget/configfs.c @@ -1524,6 +1524,18 @@ static void android_disconnect(struct usb_gadget *gadget) struct usb_composite_dev *cdev = get_gadget_data(gadget); struct gadget_info *gi = container_of(cdev, struct gadget_info, cdev); + /* FIXME: There's a race between usb_gadget_udc_stop() which is likely + * to set the gadget driver to NULL in the udc driver and this drivers + * gadget disconnect fn which likely checks for the gadget driver to + * be a null ptr. It happens that unbind (doing set_gadget_data(NULL)) + * is called before the gadget driver is set to NULL and the udc driver + * calls disconnect fn which results in cdev being a null ptr. + */ + if (cdev == NULL) { + WARN(1, "%s: gadget driver already disconnected\n", __func__); + return; + } + /* accessory HID support can be active while the accessory function is not actually enabled, so we need to inform it when we are disconnected. From 0b2b06279f10756549ef33dad85cc1d738e67e43 Mon Sep 17 00:00:00 2001 From: Badhri Jagan Sridharan Date: Fri, 21 Jul 2017 14:58:16 -0700 Subject: [PATCH 147/546] ANDROID: usb: gadget: assign no-op request complete callbacks The req->complete seems to presist the callback pointer for the control requests. This causes the serial of the accessory to be overridden when an accessory function specific out control request is issued right after the ACCESSORY_SEND_STRING control request. Therefore, assign a no-op req complete function when nothing needs to be done once the request is completed. Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: I78c1602e9a044b8718b270b8a068cf5afc83f984 --- drivers/usb/gadget/function/f_accessory.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/function/f_accessory.c b/drivers/usb/gadget/function/f_accessory.c index 76b8ae08a551..d31c0809046f 100644 --- a/drivers/usb/gadget/function/f_accessory.c +++ b/drivers/usb/gadget/function/f_accessory.c @@ -812,6 +812,14 @@ static struct hid_driver acc_hid_driver = { .probe = acc_hid_probe, }; +static void acc_complete_setup_noop(struct usb_ep *ep, struct usb_request *req) +{ + /* + * Default no-op function when nothing needs to be done for the + * setup request + */ +} + int acc_ctrlrequest(struct usb_composite_dev *cdev, const struct usb_ctrlrequest *ctrl) { @@ -839,6 +847,7 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, schedule_delayed_work( &dev->start_work, msecs_to_jiffies(10)); value = 0; + cdev->req->complete = acc_complete_setup_noop; } else if (b_request == ACCESSORY_SEND_STRING) { dev->string_index = w_index; cdev->gadget->ep0->driver_data = dev; @@ -847,10 +856,13 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, } else if (b_request == ACCESSORY_SET_AUDIO_MODE && w_index == 0 && w_length == 0) { dev->audio_mode = w_value; + cdev->req->complete = acc_complete_setup_noop; value = 0; } else if (b_request == ACCESSORY_REGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; value = acc_register_hid(dev, w_value, w_index); } else if (b_request == ACCESSORY_UNREGISTER_HID) { + cdev->req->complete = acc_complete_setup_noop; value = acc_unregister_hid(dev, w_value); } else if (b_request == ACCESSORY_SET_HID_REPORT_DESC) { spin_lock_irqsave(&dev->lock, flags); @@ -885,7 +897,7 @@ int acc_ctrlrequest(struct usb_composite_dev *cdev, if (b_request == ACCESSORY_GET_PROTOCOL) { *((u16 *)cdev->req->buf) = PROTOCOL_VERSION; value = sizeof(u16); - + cdev->req->complete = acc_complete_setup_noop; /* clear any string left over from a previous session */ memset(dev->manufacturer, 0, sizeof(dev->manufacturer)); memset(dev->model, 0, sizeof(dev->model)); From 2c47fb2bb66b8d30c77a84d29064726d487711e8 Mon Sep 17 00:00:00 2001 From: Felipe Balbi Date: Wed, 2 Dec 2015 10:06:45 -0600 Subject: [PATCH 148/546] BACKPORT: usb: dwc3: gadget: handle request->zero So far, dwc3 has always missed request->zero handling for every endpoint. Let's implement that so we can handle cases where transfer must be finished with a ZLP. Note that dwc3 is a little special. Even though we're dealing with a ZLP, we still need a buffer of wMaxPacketSize bytes; to hide that detail from every gadget driver, we have a preallocated buffer of 1024 bytes (biggest bulk size) to use (and share) among all endpoints. (cherry-picked from commit 04c03d10e507052cfce6910ddf34091196e79e1c) Reported-by: Ravi B Signed-off-by: Felipe Balbi Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: Ied4093e92dd080f2d9dd1fd9aefb36406f66bb67 --- drivers/usb/dwc3/core.h | 3 +++ drivers/usb/dwc3/gadget.c | 49 +++++++++++++++++++++++++++++++++++++-- 2 files changed, 50 insertions(+), 2 deletions(-) diff --git a/drivers/usb/dwc3/core.h b/drivers/usb/dwc3/core.h index 68d11d7d4028..d5b604a2c325 100644 --- a/drivers/usb/dwc3/core.h +++ b/drivers/usb/dwc3/core.h @@ -37,6 +37,7 @@ #define DWC3_MSG_MAX 500 /* Global constants */ +#define DWC3_ZLP_BUF_SIZE 1024 /* size of a superspeed bulk */ #define DWC3_EP0_BOUNCE_SIZE 512 #define DWC3_ENDPOINTS_NUM 32 #define DWC3_XHCI_RESOURCES_NUM 2 @@ -645,6 +646,7 @@ struct dwc3_scratchpad_array { * @ctrl_req: usb control request which is used for ep0 * @ep0_trb: trb which is used for the ctrl_req * @ep0_bounce: bounce buffer for ep0 + * @zlp_buf: used when request->zero is set * @setup_buf: used while precessing STD USB requests * @ctrl_req_addr: dma address of ctrl_req * @ep0_trb: dma address of ep0_trb @@ -732,6 +734,7 @@ struct dwc3 { struct usb_ctrlrequest *ctrl_req; struct dwc3_trb *ep0_trb; void *ep0_bounce; + void *zlp_buf; void *scratchbuf; u8 *setup_buf; dma_addr_t ctrl_req_addr; diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index d3bd1afd6302..c0d21367d107 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1200,6 +1200,32 @@ static int __dwc3_gadget_ep_queue(struct dwc3_ep *dep, struct dwc3_request *req) return ret; } +static void __dwc3_gadget_ep_zlp_complete(struct usb_ep *ep, + struct usb_request *request) +{ + dwc3_gadget_ep_free_request(ep, request); +} + +static int __dwc3_gadget_ep_queue_zlp(struct dwc3 *dwc, struct dwc3_ep *dep) +{ + struct dwc3_request *req; + struct usb_request *request; + struct usb_ep *ep = &dep->endpoint; + + dwc3_trace(trace_dwc3_gadget, "queueing ZLP\n"); + request = dwc3_gadget_ep_alloc_request(ep, GFP_ATOMIC); + if (!request) + return -ENOMEM; + + request->length = 0; + request->buf = dwc->zlp_buf; + request->complete = __dwc3_gadget_ep_zlp_complete; + + req = to_dwc3_request(request); + + return __dwc3_gadget_ep_queue(dep, req); +} + static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, gfp_t gfp_flags) { @@ -1227,6 +1253,15 @@ static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, ret = __dwc3_gadget_ep_queue(dep, req); + /* + * Okay, here's the thing, if gadget driver has requested for a ZLP by + * setting request->zero, instead of doing magic, we will just queue an + * extra usb_request ourselves so that it gets handled the same way as + * any other request. + */ + if (ret == 0 && request->zero && (request->length % ep->maxpacket == 0)) + ret = __dwc3_gadget_ep_queue_zlp(dwc, dep); + out: spin_unlock_irqrestore(&dwc->lock, flags); @@ -2798,6 +2833,12 @@ int dwc3_gadget_init(struct dwc3 *dwc) goto err3; } + dwc->zlp_buf = kzalloc(DWC3_ZLP_BUF_SIZE, GFP_KERNEL); + if (!dwc->zlp_buf) { + ret = -ENOMEM; + goto err4; + } + dwc->gadget.ops = &dwc3_gadget_ops; dwc->gadget.speed = USB_SPEED_UNKNOWN; dwc->gadget.sg_supported = true; @@ -2839,16 +2880,19 @@ int dwc3_gadget_init(struct dwc3 *dwc) ret = dwc3_gadget_init_endpoints(dwc); if (ret) - goto err4; + goto err5; ret = usb_add_gadget_udc(dwc->dev, &dwc->gadget); if (ret) { dev_err(dwc->dev, "failed to register udc\n"); - goto err4; + goto err5; } return 0; +err5: + kfree(dwc->zlp_buf); + err4: dwc3_gadget_free_endpoints(dwc); dma_free_coherent(dwc->dev, DWC3_EP0_BOUNCE_SIZE, @@ -2881,6 +2925,7 @@ void dwc3_gadget_exit(struct dwc3 *dwc) dwc->ep0_bounce, dwc->ep0_bounce_addr); kfree(dwc->setup_buf); + kfree(dwc->zlp_buf); dma_free_coherent(dwc->dev, sizeof(*dwc->ep0_trb) * 2, dwc->ep0_trb, dwc->ep0_trb_addr); From a47cfd858588971aad3c60aac96921b0f65fcaa5 Mon Sep 17 00:00:00 2001 From: John Youn Date: Tue, 22 Dec 2015 12:23:20 -0800 Subject: [PATCH 149/546] UPSTREAM: usb: dwc3: gadget: don't send extra ZLP If the request->length is zero, a ZLP should already be sent due to that and another ZLP is not needed to terminate the transfer. (cherry-picked from commit d9261898a4b2c143c28568dc686a1becfc637a99) Fixes: 04c03d10e507 ("usb: dwc3: gadget: handle request->zero") Signed-off-by: John Youn Signed-off-by: Felipe Balbi Signed-off-by: Badhri Jagan Sridharan Bug: 63867169 Change-Id: I97a1801c57dc169b6e9371d5aec599a14f316aff --- drivers/usb/dwc3/gadget.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/usb/dwc3/gadget.c b/drivers/usb/dwc3/gadget.c index c0d21367d107..3368c3a88a57 100644 --- a/drivers/usb/dwc3/gadget.c +++ b/drivers/usb/dwc3/gadget.c @@ -1259,7 +1259,8 @@ static int dwc3_gadget_ep_queue(struct usb_ep *ep, struct usb_request *request, * extra usb_request ourselves so that it gets handled the same way as * any other request. */ - if (ret == 0 && request->zero && (request->length % ep->maxpacket == 0)) + if (ret == 0 && request->zero && request->length && + (request->length % ep->maxpacket == 0)) ret = __dwc3_gadget_ep_queue_zlp(dwc, dep); out: From e86be20135e3a33365ed38ce8626f88833cbdd83 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 10:43:14 -0700 Subject: [PATCH 150/546] ANDROID: NFC: st21nfca: Fix out of bounds kernel access when handling ATR_REQ Out of bounds kernel accesses in st21nfca's NFC HCI layer might happen when handling ATR_REQ events if user-specified atr_req->length is bigger than the buffer size. In that case memcpy() inside st21nfca_tm_send_atr_res() will read extra bytes resulting in OOB read from the kernel heap. Bug: 62679012 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/st21nfca/dep.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/nfc/st21nfca/dep.c b/drivers/nfc/st21nfca/dep.c index 798a32bbac5d..206285210ab5 100644 --- a/drivers/nfc/st21nfca/dep.c +++ b/drivers/nfc/st21nfca/dep.c @@ -217,7 +217,8 @@ static int st21nfca_tm_recv_atr_req(struct nfc_hci_dev *hdev, atr_req = (struct st21nfca_atr_req *)skb->data; - if (atr_req->length < sizeof(struct st21nfca_atr_req)) { + if (atr_req->length < sizeof(struct st21nfca_atr_req) || + atr_req->length > skb->len) { r = -EPROTO; goto exit; } From 8b36e8c70a9e0d71d015448fefdf3e3bc5eb85e8 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 16:30:47 -0700 Subject: [PATCH 151/546] ANDROID: nfc: fdp: Fix possible buffer overflow in WCS4000 NFC driver Possible buffer overflow when reading next_read_size bytes into tmp buffer after next_read_size was extracted from a previous packet. Bug: 62678828 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/fdp/i2c.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/nfc/fdp/i2c.c b/drivers/nfc/fdp/i2c.c index a5d7332dfce5..2d043415f326 100644 --- a/drivers/nfc/fdp/i2c.c +++ b/drivers/nfc/fdp/i2c.c @@ -177,6 +177,16 @@ static int fdp_nci_i2c_read(struct fdp_i2c_phy *phy, struct sk_buff **skb) /* Packet that contains a length */ if (tmp[0] == 0 && tmp[1] == 0) { phy->next_read_size = (tmp[2] << 8) + tmp[3] + 3; + /* + * Ensure next_read_size does not exceed sizeof(tmp) + * for reading that many bytes during next iteration + */ + if (phy->next_read_size > FDP_NCI_I2C_MAX_PAYLOAD) { + dev_dbg(&client->dev, "%s: corrupted packet\n", + __func__); + phy->next_read_size = 5; + goto flush; + } } else { phy->next_read_size = FDP_NCI_I2C_MIN_PAYLOAD; From 8eddd2bacf12f9f5be871f3e0649107aa42091e1 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 15:50:54 -0700 Subject: [PATCH 152/546] ANDROID: NFC: Fix possible memory corruption when handling SHDLC I-Frame commands When handling SHDLC I-Frame commands "pipe" field used for indexing into an array should be checked before usage. If left unchecked it might access memory outside of the array of size NFC_HCI_MAX_PIPES(127). Bug: 62679701 Signed-off-by: Suren Baghdasaryan --- net/nfc/hci/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/net/nfc/hci/core.c b/net/nfc/hci/core.c index 2b0f0ac498d2..5a58f9f38095 100644 --- a/net/nfc/hci/core.c +++ b/net/nfc/hci/core.c @@ -209,6 +209,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } create_info = (struct hci_create_pipe_resp *)skb->data; + if (create_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + /* Save the new created pipe and bind with local gate, * the description for skb->data[3] is destination gate id * but since we received this cmd from host controller, we @@ -232,6 +237,11 @@ void nfc_hci_cmd_received(struct nfc_hci_dev *hdev, u8 pipe, u8 cmd, } delete_info = (struct hci_delete_pipe_noti *)skb->data; + if (delete_info->pipe >= NFC_HCI_MAX_PIPES) { + status = NFC_HCI_ANY_E_NOK; + goto exit; + } + hdev->pipes[delete_info->pipe].gate = NFC_HCI_INVALID_GATE; hdev->pipes[delete_info->pipe].dest_host = NFC_HCI_INVALID_HOST; break; From 5ff061cb68da492f5196b98e29f0e5d62b03d073 Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Tue, 15 Aug 2017 15:12:24 -0700 Subject: [PATCH 153/546] ANDROID: check dir value of xfrm_userpolicy_id Check user provided dir value to prevent out-of-bound access which may occur if dir is not less than XFRM_POLICY_MAX. (url: http://seclists.org/bugtraq/2017/Jul/30) Bug: 64257838 Signed-off-by: Suren Baghdasaryan Change-Id: I5bbdf95e14a61bdf5207977d9a5a4465bc848da0 --- net/xfrm/xfrm_user.c | 18 ++++++++++++++++++ 1 file changed, 18 insertions(+) diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 7a5a64e70b4d..4c696d4d5ce3 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -1691,6 +1691,10 @@ static struct sk_buff *xfrm_policy_netlink(struct sk_buff *in_skb, struct sk_buff *skb; int err; + err = verify_policy_dir(dir); + if (err) + return ERR_PTR(err); + skb = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL); if (!skb) return ERR_PTR(-ENOMEM); @@ -2216,6 +2220,10 @@ static int xfrm_do_migrate(struct sk_buff *skb, struct nlmsghdr *nlh, int n = 0; struct net *net = sock_net(skb->sk); + err = verify_policy_dir(pi->dir); + if (err) + return err; + if (attrs[XFRMA_MIGRATE] == NULL) return -EINVAL; @@ -2331,6 +2339,11 @@ static int xfrm_send_migrate(const struct xfrm_selector *sel, u8 dir, u8 type, { struct net *net = &init_net; struct sk_buff *skb; + int err; + + err = verify_policy_dir(dir); + if (err) + return err; skb = nlmsg_new(xfrm_migrate_msgsize(num_migrate, !!k), GFP_ATOMIC); if (skb == NULL) @@ -2985,6 +2998,11 @@ static int xfrm_notify_policy_flush(const struct km_event *c) static int xfrm_send_policy_notify(struct xfrm_policy *xp, int dir, const struct km_event *c) { + int err; + + err = verify_policy_dir(dir); + if (err) + return err; switch (c->event) { case XFRM_MSG_NEWPOLICY: From 63123536fe256cd4bc949926c245eb49d32b94bf Mon Sep 17 00:00:00 2001 From: Suren Baghdasaryan Date: Thu, 17 Aug 2017 13:58:40 -0700 Subject: [PATCH 154/546] ANDROID: NFC: st21nfca: Fix memory OOB and leak issues in connectivity events handler Overflow on memcpy is possible in kernel driver for st21nfca's NFC HCI layer when handling connectivity events if aid_len or params_len are bigger than the buffer size. Memory leak is possible when parameter tag is invalid. Bug: 62679581 Signed-off-by: Suren Baghdasaryan --- drivers/nfc/st21nfca/se.c | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/drivers/nfc/st21nfca/se.c b/drivers/nfc/st21nfca/se.c index c79d99b24c96..2d4b6e910b87 100644 --- a/drivers/nfc/st21nfca/se.c +++ b/drivers/nfc/st21nfca/se.c @@ -321,23 +321,33 @@ int st21nfca_connectivity_event_received(struct nfc_hci_dev *hdev, u8 host, * AID 81 5 to 16 * PARAMETERS 82 0 to 255 */ - if (skb->len < NFC_MIN_AID_LENGTH + 2 && + if (skb->len < NFC_MIN_AID_LENGTH + 2 || skb->data[0] != NFC_EVT_TRANSACTION_AID_TAG) return -EPROTO; + /* + * Buffer should have enough space for at least + * two tag fields + two length fields + aid_len (skb->data[1]) + */ + if (skb->len < skb->data[1] + 4) + return -EPROTO; + transaction = (struct nfc_evt_transaction *)devm_kzalloc(dev, skb->len - 2, GFP_KERNEL); transaction->aid_len = skb->data[1]; memcpy(transaction->aid, &skb->data[2], transaction->aid_len); - - /* Check next byte is PARAMETERS tag (82) */ - if (skb->data[transaction->aid_len + 2] != - NFC_EVT_TRANSACTION_PARAMS_TAG) - return -EPROTO; - transaction->params_len = skb->data[transaction->aid_len + 3]; + + /* Check next byte is PARAMETERS tag (82) and the length field */ + if (skb->data[transaction->aid_len + 2] != + NFC_EVT_TRANSACTION_PARAMS_TAG || + skb->len < transaction->aid_len + transaction->params_len + 4) { + devm_kfree(dev, transaction); + return -EPROTO; + } + memcpy(transaction->params, skb->data + transaction->aid_len + 4, transaction->params_len); From 150bc36aba70fdf9b876bd980c4e1da0b96c3afd Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Mon, 28 Aug 2017 09:56:27 -0700 Subject: [PATCH 155/546] UPSTREAM: cpufreq: schedutil: Make iowait boost more energy efficient Currently the iowait_boost feature in schedutil makes the frequency go to max on iowait wakeups. This feature was added to handle a case that Peter described where the throughput of operations involving continuous I/O requests [1] is reduced due to running at a lower frequency, however the lower throughput itself causes utilization to be low and hence causing frequency to be low hence its "stuck". Instead of going to max, its also possible to achieve the same effect by ramping up to max if there are repeated in_iowait wakeups happening. This patch is an attempt to do that. We start from a lower frequency (policy->min) and double the boost for every consecutive iowait update until we reach the maximum iowait boost frequency (iowait_boost_max). I ran a synthetic test (continuous O_DIRECT writes in a loop) on an x86 machine with intel_pstate in passive mode using schedutil. In this test the iowait_boost value ramped from 800MHz to 4GHz in 60ms. The patch achieves the desired improved throughput as the existing behavior. [1] https://patchwork.kernel.org/patch/9735885/ Change-Id: I4a018434a50f4ca29ec15b03465f6dc212e54423 Cc: Srinivas Pandruvada Cc: Len Brown Cc: Rafael J. Wysocki Cc: Viresh Kumar Cc: Ingo Molnar Cc: Peter Zijlstra Suggested-by: Peter Zijlstra Suggested-by: Viresh Kumar Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 38 +++++++++++++++++++++++++++----- 1 file changed, 32 insertions(+), 6 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index e12309c1b07b..f6b95ca15023 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -64,6 +64,7 @@ struct sugov_cpu { struct update_util_data update_util; struct sugov_policy *sg_policy; + bool iowait_boost_pending; unsigned long iowait_boost; unsigned long iowait_boost_max; u64 last_update; @@ -224,30 +225,54 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, unsigned int flags) { if (flags & SCHED_CPUFREQ_IOWAIT) { - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + if (sg_cpu->iowait_boost_pending) + return; + + sg_cpu->iowait_boost_pending = true; + + if (sg_cpu->iowait_boost) { + sg_cpu->iowait_boost <<= 1; + if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) + sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + } else { + sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + } } else if (sg_cpu->iowait_boost) { s64 delta_ns = time - sg_cpu->last_update; /* Clear iowait_boost if the CPU apprears to have been idle. */ - if (delta_ns > TICK_NSEC) + if (delta_ns > TICK_NSEC) { sg_cpu->iowait_boost = 0; + sg_cpu->iowait_boost_pending = false; + } } } static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util = sg_cpu->iowait_boost; - unsigned long boost_max = sg_cpu->iowait_boost_max; + unsigned long boost_util, boost_max; - if (!boost_util) + if (!sg_cpu->iowait_boost) return; + if (sg_cpu->iowait_boost_pending) { + sg_cpu->iowait_boost_pending = false; + } else { + sg_cpu->iowait_boost >>= 1; + if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + sg_cpu->iowait_boost = 0; + return; + } + } + + boost_util = sg_cpu->iowait_boost; + boost_max = sg_cpu->iowait_boost_max; + if (*util * boost_max < *max * boost_util) { *util = boost_util; *max = boost_max; } - sg_cpu->iowait_boost >>= 1; } #ifdef CONFIG_NO_HZ_COMMON @@ -320,6 +345,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) delta_ns = last_freq_update_time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; + j_sg_cpu->iowait_boost_pending = false; continue; } if (j_sg_cpu->flags & SCHED_CPUFREQ_DL) From 19a919c4cf35a193f4740a25d08dd844f2048c16 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Sun, 23 Jul 2017 08:54:26 -0700 Subject: [PATCH 156/546] UPSTREAM: cpufreq: schedutil: Use unsigned int for iowait boost Make iowait_boost and iowait_boost_max as unsigned int since its unit is kHz and this is consistent with struct cpufreq_policy. Also change the local variables in sugov_iowait_boost to match this. Change-Id: I6c67ed94c57c4bdb24bada4b97045593fcb95d2e Cc: Srinivas Pandruvada Cc: Len Brown Cc: Rafael J. Wysocki Cc: Viresh Kumar Cc: Ingo Molnar Cc: Peter Zijlstra Signed-off-by: Joel Fernandes --- kernel/sched/cpufreq_schedutil.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index f6b95ca15023..a0bcf488fb81 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -65,8 +65,8 @@ struct sugov_cpu { struct sugov_policy *sg_policy; bool iowait_boost_pending; - unsigned long iowait_boost; - unsigned long iowait_boost_max; + unsigned int iowait_boost; + unsigned int iowait_boost_max; u64 last_update; /* The fields below are only needed when sharing a policy. */ @@ -251,7 +251,7 @@ static void sugov_set_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, unsigned long *util, unsigned long *max) { - unsigned long boost_util, boost_max; + unsigned int boost_util, boost_max; if (!sg_cpu->iowait_boost) return; From c812848dc3b00b2fc046d2c37e01cf5517c3df1d Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Tue, 29 Aug 2017 10:36:40 +0530 Subject: [PATCH 157/546] android: android-base.config: enable IP6_NF_MATCH_RPFILTER USB tethering has a hard dependency on ip6t_rpfilter module now. So enable IP6_NF_MATCH_RPFILTER config to make it work again, otherwise we run into following failures when we try to enable USB tethering on Hikey: W IptablesRestoreController: iptables-restore process 1893 terminated status=256 E IptablesRestoreController: iptables error: E IptablesRestoreController: ------- COMMAND ------- E IptablesRestoreController: *raw E IptablesRestoreController: -A natctrl_raw_PREROUTING -i usb0 -m rpfilter --invert ! -s fe80::/64 -j DROP E IptablesRestoreController: COMMIT E IptablesRestoreController: E IptablesRestoreController: ------- ERROR ------- E IptablesRestoreController: ip6tables-restore: line 319 failed E IptablesRestoreController: ---------------------- E NatController: Error setting forward rules E Tethering: [usb0] ERROR Exception enabling NAT: java.lang.IllegalStateException: command '55 nat enable usb0 wlan0 1 192.168.42.0/24' failed with '400 55 Nat operation failed (No such device)' Change-Id: I4a4e192ee1c81a7a425eefee9a2a53dd41e1fa0e Signed-off-by: Amit Pundir --- android/configs/android-base.cfg | 1 + 1 file changed, 1 insertion(+) diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg index 48b2cdbe8d49..7199561cd42b 100644 --- a/android/configs/android-base.cfg +++ b/android/configs/android-base.cfg @@ -36,6 +36,7 @@ CONFIG_INET_XFRM_MODE_TUNNEL=y CONFIG_IP6_NF_FILTER=y CONFIG_IP6_NF_IPTABLES=y CONFIG_IP6_NF_MANGLE=y +CONFIG_IP6_NF_MATCH_RPFILTER=y CONFIG_IP6_NF_RAW=y CONFIG_IP6_NF_TARGET_REJECT=y CONFIG_IPV6=y From 4e6b2d54f8f202fe5acc9565a8806e97907a0aff Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 30 Jun 2017 10:22:23 -0700 Subject: [PATCH 158/546] FROMLIST: android: binder: Refactor prev and next buffer into a helper function (from https://patchwork.kernel.org/patch/9928605/) Use helper functions buffer_next and buffer_prev instead of list_entry to get the next and previous buffers. Bug: 36007193 Change-Id: I422dce84afde3d2138a6d976593b109a9cc49003 Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index aabfebac6e57..134c649e5a24 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -48,14 +48,23 @@ module_param_named(debug_mask, binder_alloc_debug_mask, pr_info(x); \ } while (0) +static struct binder_buffer *binder_buffer_next(struct binder_buffer *buffer) +{ + return list_entry(buffer->entry.next, struct binder_buffer, entry); +} + +static struct binder_buffer *binder_buffer_prev(struct binder_buffer *buffer) +{ + return list_entry(buffer->entry.prev, struct binder_buffer, entry); +} + static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { if (list_is_last(&buffer->entry, &alloc->buffers)) return alloc->buffer + alloc->buffer_size - (void *)buffer->data; - return (size_t)list_entry(buffer->entry.next, - struct binder_buffer, entry) - (size_t)buffer->data; + return (size_t)binder_buffer_next(buffer) - (size_t)buffer->data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, @@ -470,7 +479,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, int free_page_start = 1; BUG_ON(alloc->buffers.next == &buffer->entry); - prev = list_entry(buffer->entry.prev, struct binder_buffer, entry); + prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); if (buffer_end_page(prev) == buffer_start_page(buffer)) { free_page_start = 0; @@ -482,8 +491,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, } if (!list_is_last(&buffer->entry, &alloc->buffers)) { - next = list_entry(buffer->entry.next, - struct binder_buffer, entry); + next = binder_buffer_next(buffer); if (buffer_start_page(next) == buffer_end_page(buffer)) { free_page_end = 0; if (buffer_start_page(next) == @@ -544,8 +552,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; if (!list_is_last(&buffer->entry, &alloc->buffers)) { - struct binder_buffer *next = list_entry(buffer->entry.next, - struct binder_buffer, entry); + struct binder_buffer *next = binder_buffer_next(buffer); if (next->free) { rb_erase(&next->rb_node, &alloc->free_buffers); @@ -553,8 +560,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, } } if (alloc->buffers.next != &buffer->entry) { - struct binder_buffer *prev = list_entry(buffer->entry.prev, - struct binder_buffer, entry); + struct binder_buffer *prev = binder_buffer_prev(buffer); if (prev->free) { binder_delete_free_buffer(alloc, buffer); From 9fd3fe1b4513561a25d17ed2e3c745fa10f7c203 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 22 Jun 2017 14:37:45 -0700 Subject: [PATCH 159/546] FROMLIST: android: binder: Add allocator selftest (from https://patchwork.kernel.org/patch/9928609/) binder_alloc_selftest tests that alloc_new_buf handles page allocation and deallocation properly when allocate and free buffers. The test allocates 5 buffers of various sizes to cover all possible page alignment cases, and frees the buffers using a list of exhaustive freeing order. Test: boot the device with ANDROID_BINDER_IPC_SELFTEST config option enabled. Allocator selftest passes. Bug: 36007193 Change-Id: I2fe396232b7dfe4bbc50bdba99ca0de9be63cc37 Signed-off-by: Sherry Yang --- drivers/android/Kconfig | 10 + drivers/android/Makefile | 1 + drivers/android/binder.c | 2 + drivers/android/binder_alloc.h | 5 + drivers/android/binder_alloc_selftest.c | 271 ++++++++++++++++++++++++ 5 files changed, 289 insertions(+) create mode 100644 drivers/android/binder_alloc_selftest.c diff --git a/drivers/android/Kconfig b/drivers/android/Kconfig index 4d4cdc1a6e25..01de42c8b74b 100644 --- a/drivers/android/Kconfig +++ b/drivers/android/Kconfig @@ -44,6 +44,16 @@ config ANDROID_BINDER_IPC_32BIT Note that enabling this will break newer Android user-space. +config ANDROID_BINDER_IPC_SELFTEST + bool "Android Binder IPC Driver Selftest" + depends on ANDROID_BINDER_IPC + ---help--- + This feature allows binder selftest to run. + + Binder selftest checks the allocation and free of binder buffers + exhaustively with combinations of various buffer sizes and + alignments. + endif # if ANDROID endmenu diff --git a/drivers/android/Makefile b/drivers/android/Makefile index 4b7c726bb560..a01254c43ee3 100644 --- a/drivers/android/Makefile +++ b/drivers/android/Makefile @@ -1,3 +1,4 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_ANDROID_BINDER_IPC) += binder.o binder_alloc.o +obj-$(CONFIG_ANDROID_BINDER_IPC_SELFTEST) += binder_alloc_selftest.o diff --git a/drivers/android/binder.c b/drivers/android/binder.c index bfdd52ea0d1c..75cbbd2e986a 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -4580,6 +4580,8 @@ static long binder_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) /*pr_info("binder_ioctl: %d:%d %x %lx\n", proc->pid, current->pid, cmd, arg);*/ + binder_selftest_alloc(&proc->alloc); + trace_binder_ioctl(cmd, arg); ret = wait_event_interruptible(binder_user_error_wait, binder_stop_on_user_error < 2); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 088e4ffc6230..4f02cc084c15 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -102,6 +102,11 @@ struct binder_alloc { int pid; }; +#ifdef CONFIG_ANDROID_BINDER_IPC_SELFTEST +void binder_selftest_alloc(struct binder_alloc *alloc); +#else +static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} +#endif extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c new file mode 100644 index 000000000000..cc00ab6ee29d --- /dev/null +++ b/drivers/android/binder_alloc_selftest.c @@ -0,0 +1,271 @@ +/* binder_alloc_selftest.c + * + * Android IPC Subsystem + * + * Copyright (C) 2017 Google, Inc. + * + * This software is licensed under the terms of the GNU General Public + * License version 2, as published by the Free Software Foundation, and + * may be copied, distributed, and modified under those terms. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include "binder_alloc.h" + +#define BUFFER_NUM 5 +#define BUFFER_MIN_SIZE (PAGE_SIZE / 8) + +static bool binder_selftest_run = true; +static int binder_selftest_failures; +static DEFINE_MUTEX(binder_selftest_lock); + +/** + * enum buf_end_align_type - Page alignment of a buffer + * end with regard to the end of the previous buffer. + * + * In the pictures below, buf2 refers to the buffer we + * are aligning. buf1 refers to previous buffer by addr. + * Symbol [ means the start of a buffer, ] means the end + * of a buffer, and | means page boundaries. + */ +enum buf_end_align_type { + /** + * @SAME_PAGE_UNALIGNED: The end of this buffer is on + * the same page as the end of the previous buffer and + * is not page aligned. Examples: + * buf1 ][ buf2 ][ ... + * buf1 ]|[ buf2 ][ ... + */ + SAME_PAGE_UNALIGNED = 0, + /** + * @SAME_PAGE_ALIGNED: When the end of the previous buffer + * is not page aligned, the end of this buffer is on the + * same page as the end of the previous buffer and is page + * aligned. When the previous buffer is page aligned, the + * end of this buffer is aligned to the next page boundary. + * Examples: + * buf1 ][ buf2 ]| ... + * buf1 ]|[ buf2 ]| ... + */ + SAME_PAGE_ALIGNED, + /** + * @NEXT_PAGE_UNALIGNED: The end of this buffer is on + * the page next to the end of the previous buffer and + * is not page aligned. Examples: + * buf1 ][ buf2 | buf2 ][ ... + * buf1 ]|[ buf2 | buf2 ][ ... + */ + NEXT_PAGE_UNALIGNED, + /** + * @NEXT_PAGE_ALIGNED: The end of this buffer is on + * the page next to the end of the previous buffer and + * is page aligned. Examples: + * buf1 ][ buf2 | buf2 ]| ... + * buf1 ]|[ buf2 | buf2 ]| ... + */ + NEXT_PAGE_ALIGNED, + /** + * @NEXT_NEXT_UNALIGNED: The end of this buffer is on + * the page that follows the page after the end of the + * previous buffer and is not page aligned. Examples: + * buf1 ][ buf2 | buf2 | buf2 ][ ... + * buf1 ]|[ buf2 | buf2 | buf2 ][ ... + */ + NEXT_NEXT_UNALIGNED, + LOOP_END, +}; + +static void pr_err_size_seq(size_t *sizes, int *seq) +{ + int i; + + pr_err("alloc sizes: "); + for (i = 0; i < BUFFER_NUM; i++) + pr_cont("[%zu]", sizes[i]); + pr_cont("\n"); + pr_err("free seq: "); + for (i = 0; i < BUFFER_NUM; i++) + pr_cont("[%d]", seq[i]); + pr_cont("\n"); +} + +static bool check_buffer_pages_allocated(struct binder_alloc *alloc, + struct binder_buffer *buffer, + size_t size) +{ + void *page_addr, *end; + int page_index; + + end = (void *)PAGE_ALIGN((uintptr_t)buffer + size); + for (page_addr = buffer; page_addr < end; page_addr += PAGE_SIZE) { + page_index = (page_addr - alloc->buffer) / PAGE_SIZE; + if (!alloc->pages[page_index]) { + pr_err("incorrect alloc state at page index %d\n", + page_index); + return false; + } + } + return true; +} + +static void binder_selftest_alloc_buf(struct binder_alloc *alloc, + struct binder_buffer *buffers[], + size_t *sizes, int *seq) +{ + int i; + + for (i = 0; i < BUFFER_NUM; i++) { + buffers[i] = binder_alloc_new_buf(alloc, sizes[i], 0, 0, 0); + if (IS_ERR(buffers[i]) || + !check_buffer_pages_allocated(alloc, buffers[i], + sizes[i])) { + pr_err_size_seq(sizes, seq); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_free_buf(struct binder_alloc *alloc, + struct binder_buffer *buffers[], + size_t *sizes, int *seq) +{ + int i; + + for (i = 0; i < BUFFER_NUM; i++) + binder_alloc_free_buf(alloc, buffers[seq[i]]); + + for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) { + if ((!alloc->pages[i]) == (i == 0)) { + pr_err("incorrect free state at page index %d\n", i); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_alloc_free(struct binder_alloc *alloc, + size_t *sizes, int *seq) +{ + struct binder_buffer *buffers[BUFFER_NUM]; + + binder_selftest_alloc_buf(alloc, buffers, sizes, seq); + binder_selftest_free_buf(alloc, buffers, sizes, seq); +} + +static bool is_dup(int *seq, int index, int val) +{ + int i; + + for (i = 0; i < index; i++) { + if (seq[i] == val) + return true; + } + return false; +} + +/* Generate BUFFER_NUM factorial free orders. */ +static void binder_selftest_free_seq(struct binder_alloc *alloc, + size_t *sizes, int *seq, int index) +{ + int i; + + if (index == BUFFER_NUM) { + binder_selftest_alloc_free(alloc, sizes, seq); + return; + } + for (i = 0; i < BUFFER_NUM; i++) { + if (is_dup(seq, index, i)) + continue; + seq[index] = i; + binder_selftest_free_seq(alloc, sizes, seq, index + 1); + } +} + +static void binder_selftest_alloc_size(struct binder_alloc *alloc, + size_t *end_offset) +{ + int i; + int seq[BUFFER_NUM] = {0}; + size_t front_sizes[BUFFER_NUM]; + size_t back_sizes[BUFFER_NUM]; + size_t last_offset, offset = 0; + + for (i = 0; i < BUFFER_NUM; i++) { + last_offset = offset; + offset = end_offset[i]; + front_sizes[i] = offset - last_offset; + back_sizes[BUFFER_NUM - i - 1] = front_sizes[i]; + } + /* + * Buffers share the first or last few pages. + * Only BUFFER_NUM - 1 buffer sizes are adjustable since + * we need one giant buffer before getting to the last page. + */ + back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1] + - sizeof(struct binder_buffer) * BUFFER_NUM; + binder_selftest_free_seq(alloc, front_sizes, seq, 0); + binder_selftest_free_seq(alloc, back_sizes, seq, 0); +} + +static void binder_selftest_alloc_offset(struct binder_alloc *alloc, + size_t *end_offset, int index) +{ + int align; + size_t end, prev; + + if (index == BUFFER_NUM) { + binder_selftest_alloc_size(alloc, end_offset); + return; + } + prev = index == 0 ? 0 : end_offset[index - 1]; + end = prev; + + BUILD_BUG_ON((BUFFER_MIN_SIZE + sizeof(struct binder_buffer)) + * BUFFER_NUM >= PAGE_SIZE); + + for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) { + if (align % 2) + end = ALIGN(end, PAGE_SIZE); + else + end += BUFFER_MIN_SIZE; + end_offset[index] = end; + binder_selftest_alloc_offset(alloc, end_offset, index + 1); + } +} + +/** + * binder_selftest_alloc() - Test alloc and free of buffer pages. + * @alloc: Pointer to alloc struct. + * + * Allocate BUFFER_NUM buffers to cover all page alignment cases, + * then free them in all orders possible. Check that pages are + * allocated after buffer alloc and freed after freeing buffer. + */ +void binder_selftest_alloc(struct binder_alloc *alloc) +{ + size_t end_offset[BUFFER_NUM]; + + if (!binder_selftest_run) + return; + mutex_lock(&binder_selftest_lock); + if (!binder_selftest_run || !alloc->vma) + goto done; + pr_info("STARTED\n"); + binder_selftest_alloc_offset(alloc, end_offset, 0); + binder_selftest_run = false; + if (binder_selftest_failures > 0) + pr_info("%d tests FAILED\n", binder_selftest_failures); + else + pr_info("PASSED\n"); + +done: + mutex_unlock(&binder_selftest_lock); +} From c500f35cf5cc3e0f106a1df9a64e8971c9caa8ae Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Thu, 3 Aug 2017 11:33:53 -0700 Subject: [PATCH 160/546] FROMLIST: android: binder: Move buffer out of area shared with user space (from https://patchwork.kernel.org/patch/9928607/) Binder driver allocates buffer meta data in a region that is mapped in user space. These meta data contain pointers in the kernel. This patch allocates buffer meta data on the kernel heap that is not mapped in user space, and uses a pointer to refer to the data mapped. Also move alloc->buffers initialization from mmap to init since it's now used even when mmap failed or was not called. Bug: 36007193 Change-Id: Id5136048bdb7b796f59de066de7ea7df410498f5 Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 146 ++++++++++++++---------- drivers/android/binder_alloc.h | 2 +- drivers/android/binder_alloc_selftest.c | 11 +- 3 files changed, 91 insertions(+), 68 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 134c649e5a24..2ca2b02f82bb 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -62,9 +62,9 @@ static size_t binder_alloc_buffer_size(struct binder_alloc *alloc, struct binder_buffer *buffer) { if (list_is_last(&buffer->entry, &alloc->buffers)) - return alloc->buffer + - alloc->buffer_size - (void *)buffer->data; - return (size_t)binder_buffer_next(buffer) - (size_t)buffer->data; + return (u8 *)alloc->buffer + + alloc->buffer_size - (u8 *)buffer->data; + return (u8 *)binder_buffer_next(buffer)->data - (u8 *)buffer->data; } static void binder_insert_free_buffer(struct binder_alloc *alloc, @@ -114,9 +114,9 @@ static void binder_insert_allocated_buffer_locked( buffer = rb_entry(parent, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (new_buffer < buffer) + if (new_buffer->data < buffer->data) p = &parent->rb_left; - else if (new_buffer > buffer) + else if (new_buffer->data > buffer->data) p = &parent->rb_right; else BUG(); @@ -131,18 +131,17 @@ static struct binder_buffer *binder_alloc_prepare_to_free_locked( { struct rb_node *n = alloc->allocated_buffers.rb_node; struct binder_buffer *buffer; - struct binder_buffer *kern_ptr; + void *kern_ptr; - kern_ptr = (struct binder_buffer *)(user_ptr - alloc->user_buffer_offset - - offsetof(struct binder_buffer, data)); + kern_ptr = (void *)(user_ptr - alloc->user_buffer_offset); while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(buffer->free); - if (kern_ptr < buffer) + if (kern_ptr < buffer->data) n = n->rb_left; - else if (kern_ptr > buffer) + else if (kern_ptr > buffer->data) n = n->rb_right; else { /* @@ -330,6 +329,9 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, return ERR_PTR(-ENOSPC); } + /* Pad 0-size buffers so they get assigned unique addresses */ + size = max(size, sizeof(void *)); + while (n) { buffer = rb_entry(n, struct binder_buffer, rb_node); BUG_ON(!buffer->free); @@ -389,14 +391,9 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, has_page_addr = (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK); - if (n == NULL) { - if (size + sizeof(struct binder_buffer) + 4 >= buffer_size) - buffer_size = size; /* no room for other buffers */ - else - buffer_size = size + sizeof(struct binder_buffer); - } + WARN_ON(n && buffer_size != size); end_page_addr = - (void *)PAGE_ALIGN((uintptr_t)buffer->data + buffer_size); + (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, @@ -404,17 +401,25 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (ret) return ERR_PTR(ret); - rb_erase(best_fit, &alloc->free_buffers); - buffer->free = 0; - buffer->free_in_progress = 0; - binder_insert_allocated_buffer_locked(alloc, buffer); if (buffer_size != size) { - struct binder_buffer *new_buffer = (void *)buffer->data + size; + struct binder_buffer *new_buffer; + new_buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!new_buffer) { + pr_err("%s: %d failed to alloc new buffer struct\n", + __func__, alloc->pid); + goto err_alloc_buf_struct_failed; + } + new_buffer->data = (u8 *)buffer->data + size; list_add(&new_buffer->entry, &buffer->entry); new_buffer->free = 1; binder_insert_free_buffer(alloc, new_buffer); } + + rb_erase(best_fit, &alloc->free_buffers); + buffer->free = 0; + buffer->free_in_progress = 0; + binder_insert_allocated_buffer_locked(alloc, buffer); binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: binder_alloc_buf size %zd got %pK\n", alloc->pid, size, buffer); @@ -429,6 +434,12 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, alloc->pid, size, alloc->free_async_space); } return buffer; + +err_alloc_buf_struct_failed: + binder_update_page_range(alloc, 0, + (void *)PAGE_ALIGN((uintptr_t)buffer->data), + end_page_addr, NULL); + return ERR_PTR(-ENOMEM); } /** @@ -463,56 +474,59 @@ struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, static void *buffer_start_page(struct binder_buffer *buffer) { - return (void *)((uintptr_t)buffer & PAGE_MASK); + return (void *)((uintptr_t)buffer->data & PAGE_MASK); } -static void *buffer_end_page(struct binder_buffer *buffer) +static void *prev_buffer_end_page(struct binder_buffer *buffer) { - return (void *)(((uintptr_t)(buffer + 1) - 1) & PAGE_MASK); + return (void *)(((uintptr_t)(buffer->data) - 1) & PAGE_MASK); } static void binder_delete_free_buffer(struct binder_alloc *alloc, struct binder_buffer *buffer) { struct binder_buffer *prev, *next = NULL; - int free_page_end = 1; - int free_page_start = 1; - + bool to_free = true; BUG_ON(alloc->buffers.next == &buffer->entry); prev = binder_buffer_prev(buffer); BUG_ON(!prev->free); - if (buffer_end_page(prev) == buffer_start_page(buffer)) { - free_page_start = 0; - if (buffer_end_page(prev) == buffer_end_page(buffer)) - free_page_end = 0; + if (prev_buffer_end_page(prev) == buffer_start_page(buffer)) { + to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, buffer->data, prev->data); } if (!list_is_last(&buffer->entry, &alloc->buffers)) { next = binder_buffer_next(buffer); - if (buffer_start_page(next) == buffer_end_page(buffer)) { - free_page_end = 0; - if (buffer_start_page(next) == - buffer_start_page(buffer)) - free_page_start = 0; + if (buffer_start_page(next) == buffer_start_page(buffer)) { + to_free = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK share page with %pK\n", - alloc->pid, buffer, prev); + "%d: merge free, buffer %pK share page with %pK\n", + alloc->pid, + buffer->data, + next->data); } } - list_del(&buffer->entry); - if (free_page_start || free_page_end) { + + if (PAGE_ALIGNED(buffer->data)) { binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%d: merge free, buffer %pK do not share page%s%s with %pK or %pK\n", - alloc->pid, buffer, free_page_start ? "" : " end", - free_page_end ? "" : " start", prev, next); - binder_update_page_range(alloc, 0, free_page_start ? - buffer_start_page(buffer) : buffer_end_page(buffer), - (free_page_end ? buffer_end_page(buffer) : - buffer_start_page(buffer)) + PAGE_SIZE, NULL); + "%d: merge free, buffer start %pK is page aligned\n", + alloc->pid, buffer->data); + to_free = false; } + + if (to_free) { + binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, + "%d: merge free, buffer %pK do not share page with %pK or %pK\n", + alloc->pid, buffer->data, + prev->data, next->data); + binder_update_page_range(alloc, 0, buffer_start_page(buffer), + buffer_start_page(buffer) + PAGE_SIZE, + NULL); + } + list_del(&buffer->entry); + kfree(buffer); } static void binder_free_buf_locked(struct binder_alloc *alloc, @@ -533,8 +547,8 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, BUG_ON(buffer->free); BUG_ON(size > buffer_size); BUG_ON(buffer->transaction != NULL); - BUG_ON((void *)buffer < alloc->buffer); - BUG_ON((void *)buffer > alloc->buffer + alloc->buffer_size); + BUG_ON(buffer->data < alloc->buffer); + BUG_ON(buffer->data > alloc->buffer + alloc->buffer_size); if (buffer->async_transaction) { alloc->free_async_space += size + sizeof(struct binder_buffer); @@ -646,14 +660,14 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, } alloc->buffer_size = vma->vm_end - vma->vm_start; - if (binder_update_page_range(alloc, 1, alloc->buffer, - alloc->buffer + PAGE_SIZE, vma)) { + buffer = kzalloc(sizeof(*buffer), GFP_KERNEL); + if (!buffer) { ret = -ENOMEM; - failure_string = "alloc small buf"; - goto err_alloc_small_buf_failed; + failure_string = "alloc buffer struct"; + goto err_alloc_buf_struct_failed; } - buffer = alloc->buffer; - INIT_LIST_HEAD(&alloc->buffers); + + buffer->data = alloc->buffer; list_add(&buffer->entry, &alloc->buffers); buffer->free = 1; binder_insert_free_buffer(alloc, buffer); @@ -664,7 +678,7 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, return 0; -err_alloc_small_buf_failed: +err_alloc_buf_struct_failed: kfree(alloc->pages); alloc->pages = NULL; err_alloc_pages_failed: @@ -684,14 +698,13 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) { struct rb_node *n; int buffers, page_count; + struct binder_buffer *buffer; BUG_ON(alloc->vma); buffers = 0; mutex_lock(&alloc->mutex); while ((n = rb_first(&alloc->allocated_buffers))) { - struct binder_buffer *buffer; - buffer = rb_entry(n, struct binder_buffer, rb_node); /* Transaction should already have been freed */ @@ -701,6 +714,16 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) buffers++; } + while (!list_empty(&alloc->buffers)) { + buffer = list_first_entry(&alloc->buffers, + struct binder_buffer, entry); + WARN_ON(!buffer->free); + + list_del(&buffer->entry); + WARN_ON_ONCE(!list_empty(&alloc->buffers)); + kfree(buffer); + } + page_count = 0; if (alloc->pages) { int i; @@ -804,5 +827,6 @@ void binder_alloc_init(struct binder_alloc *alloc) alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(&alloc->mutex); + INIT_LIST_HEAD(&alloc->buffers); } diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index 4f02cc084c15..dd5649bf6469 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -57,7 +57,7 @@ struct binder_buffer { size_t data_size; size_t offsets_size; size_t extra_buffers_size; - uint8_t data[0]; + void *data; }; /** diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index cc00ab6ee29d..0bf72079a9da 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -105,8 +105,9 @@ static bool check_buffer_pages_allocated(struct binder_alloc *alloc, void *page_addr, *end; int page_index; - end = (void *)PAGE_ALIGN((uintptr_t)buffer + size); - for (page_addr = buffer; page_addr < end; page_addr += PAGE_SIZE) { + end = (void *)PAGE_ALIGN((uintptr_t)buffer->data + size); + page_addr = buffer->data; + for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; if (!alloc->pages[page_index]) { pr_err("incorrect alloc state at page index %d\n", @@ -209,8 +210,7 @@ static void binder_selftest_alloc_size(struct binder_alloc *alloc, * Only BUFFER_NUM - 1 buffer sizes are adjustable since * we need one giant buffer before getting to the last page. */ - back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1] - - sizeof(struct binder_buffer) * BUFFER_NUM; + back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1]; binder_selftest_free_seq(alloc, front_sizes, seq, 0); binder_selftest_free_seq(alloc, back_sizes, seq, 0); } @@ -228,8 +228,7 @@ static void binder_selftest_alloc_offset(struct binder_alloc *alloc, prev = index == 0 ? 0 : end_offset[index - 1]; end = prev; - BUILD_BUG_ON((BUFFER_MIN_SIZE + sizeof(struct binder_buffer)) - * BUFFER_NUM >= PAGE_SIZE); + BUILD_BUG_ON(BUFFER_MIN_SIZE * BUFFER_NUM >= PAGE_SIZE); for (align = SAME_PAGE_UNALIGNED; align < LOOP_END; align++) { if (align % 2) From 652309276d7c2f75bffcde4b486f1f911d7af82d Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Sat, 29 Jul 2017 13:24:11 -0700 Subject: [PATCH 161/546] FROMLIST: android: binder: Add global lru shrinker to binder (from https://patchwork.kernel.org/patch/9928615/) Hold on to the pages allocated and mapped for transaction buffers until the system is under memory pressure. When that happens, use linux shrinker to free pages. Without using shrinker, patch "android: binder: Move buffer out of area shared with user space" will cause a significant slow down for small transactions that fit into the first page because free list buffer header used to be inlined with buffer data. In addition to prevent the performance regression for small transactions, this patch improves the performance for transactions that take up more than one page. Modify alloc selftest to work with the shrinker change. Test: Run memory intensive applications (Chrome and Camera) to trigger shrinker callbacks. Binder frees memory as expected. Test: Run binderThroughputTest with high memory pressure option enabled. Bug: 63926541 Change-Id: I3abfc43b405e7e0a6228da37e0689a4b944f0e00 Signed-off-by: Sherry Yang --- drivers/android/binder.c | 2 + drivers/android/binder_alloc.c | 173 ++++++++++++++++++++---- drivers/android/binder_alloc.h | 23 +++- drivers/android/binder_alloc_selftest.c | 68 ++++++++-- 4 files changed, 226 insertions(+), 40 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 75cbbd2e986a..3ce96db8e5d4 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5623,6 +5623,8 @@ static int __init binder_init(void) struct binder_device *device; struct hlist_node *tmp; + binder_alloc_shrinker_init(); + atomic_set(&binder_transaction_log.cur, ~0U); atomic_set(&binder_transaction_log_failed.cur, ~0U); binder_deferred_workqueue = create_singlethread_workqueue("binder"); diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 2ca2b02f82bb..bcdaf684a658 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -27,9 +27,12 @@ #include #include #include +#include #include "binder_alloc.h" #include "binder_trace.h" +struct list_lru binder_alloc_lru; + static DEFINE_MUTEX(binder_alloc_mmap_lock); enum { @@ -188,8 +191,9 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, { void *page_addr; unsigned long user_page_addr; - struct page **page; - struct mm_struct *mm; + struct binder_lru_page *page; + struct mm_struct *mm = NULL; + bool need_mm = false; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, "%d: %s pages %pK-%pK\n", alloc->pid, @@ -200,9 +204,18 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, trace_binder_update_page_range(alloc, allocate, start, end); - if (vma) - mm = NULL; - else + if (allocate == 0) + goto free_range; + + for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + if (!page->page_ptr) { + need_mm = true; + break; + } + } + + if (!vma && need_mm) mm = get_task_mm(alloc->tsk); if (mm) { @@ -215,10 +228,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (allocate == 0) - goto free_range; - - if (vma == NULL) { + if (!vma && need_mm) { pr_err("%d: binder_alloc_buf failed to map pages in userspace, no vma\n", alloc->pid); goto err_no_vma; @@ -226,18 +236,33 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; + bool on_lru; page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - BUG_ON(*page); - *page = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); - if (*page == NULL) { + if (page->page_ptr) { + on_lru = list_lru_del(&binder_alloc_lru, &page->lru); + WARN_ON(!on_lru); + continue; + } + + if (WARN_ON(!vma)) + goto err_page_ptr_cleared; + + page->page_ptr = alloc_page(GFP_KERNEL | + __GFP_HIGHMEM | + __GFP_ZERO); + if (!page->page_ptr) { pr_err("%d: binder_alloc_buf failed for page at %pK\n", alloc->pid, page_addr); goto err_alloc_page_failed; } + page->alloc = alloc; + INIT_LIST_HEAD(&page->lru); + ret = map_kernel_range_noflush((unsigned long)page_addr, - PAGE_SIZE, PAGE_KERNEL, page); + PAGE_SIZE, PAGE_KERNEL, + &page->page_ptr); flush_cache_vmap((unsigned long)page_addr, (unsigned long)page_addr + PAGE_SIZE); if (ret != 1) { @@ -247,7 +272,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } user_page_addr = (uintptr_t)page_addr + alloc->user_buffer_offset; - ret = vm_insert_page(vma, user_page_addr, page[0]); + ret = vm_insert_page(vma, user_page_addr, page[0].page_ptr); if (ret) { pr_err("%d: binder_alloc_buf failed to map page at %lx in userspace\n", alloc->pid, user_page_addr); @@ -264,16 +289,21 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, free_range: for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { + bool ret; + page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; - if (vma) - zap_page_range(vma, (uintptr_t)page_addr + - alloc->user_buffer_offset, PAGE_SIZE, NULL); + + ret = list_lru_add(&binder_alloc_lru, &page->lru); + WARN_ON(!ret); + continue; + err_vm_insert_page_failed: unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); err_map_kernel_failed: - __free_page(*page); - *page = NULL; + __free_page(page->page_ptr); + page->page_ptr = NULL; err_alloc_page_failed: +err_page_ptr_cleared: ; } err_no_vma: @@ -730,16 +760,20 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { void *page_addr; + bool on_lru; - if (!alloc->pages[i]) + if (!alloc->pages[i].page_ptr) continue; + on_lru = list_lru_del(&binder_alloc_lru, + &alloc->pages[i].lru); page_addr = alloc->buffer + i * PAGE_SIZE; binder_alloc_debug(BINDER_DEBUG_BUFFER_ALLOC, - "%s: %d: page %d at %pK not freed\n", - __func__, alloc->pid, i, page_addr); + "%s: %d: page %d at %pK %s\n", + __func__, alloc->pid, i, page_addr, + on_lru ? "on lru" : "active"); unmap_kernel_range((unsigned long)page_addr, PAGE_SIZE); - __free_page(alloc->pages[i]); + __free_page(alloc->pages[i].page_ptr); page_count++; } kfree(alloc->pages); @@ -815,6 +849,94 @@ void binder_alloc_vma_close(struct binder_alloc *alloc) WRITE_ONCE(alloc->vma_vm_mm, NULL); } +/** + * binder_alloc_free_page() - shrinker callback to free pages + * @item: item to free + * @lock: lock protecting the item + * @cb_arg: callback argument + * + * Called from list_lru_walk() in binder_shrink_scan() to free + * up pages when the system is under memory pressure. + */ +enum lru_status binder_alloc_free_page(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lock, + void *cb_arg) +{ + struct mm_struct *mm = NULL; + struct binder_lru_page *page = container_of(item, + struct binder_lru_page, + lru); + struct binder_alloc *alloc; + uintptr_t page_addr; + size_t index; + + alloc = page->alloc; + if (!mutex_trylock(&alloc->mutex)) + goto err_get_alloc_mutex_failed; + + if (!page->page_ptr) + goto err_page_already_freed; + + index = page - alloc->pages; + page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; + if (alloc->vma) { + mm = get_task_mm(alloc->tsk); + if (!mm) + goto err_get_task_mm_failed; + if (!down_write_trylock(&mm->mmap_sem)) + goto err_down_write_mmap_sem_failed; + + zap_page_range(alloc->vma, + page_addr + + alloc->user_buffer_offset, + PAGE_SIZE, NULL); + + up_write(&mm->mmap_sem); + mmput(mm); + } + + unmap_kernel_range(page_addr, PAGE_SIZE); + __free_page(page->page_ptr); + page->page_ptr = NULL; + + list_lru_isolate(lru, item); + + mutex_unlock(&alloc->mutex); + return LRU_REMOVED; + +err_down_write_mmap_sem_failed: + mmput(mm); +err_get_task_mm_failed: +err_page_already_freed: + mutex_unlock(&alloc->mutex); +err_get_alloc_mutex_failed: + return LRU_SKIP; +} + +static unsigned long +binder_shrink_count(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long ret = list_lru_count(&binder_alloc_lru); + return ret; +} + +static unsigned long +binder_shrink_scan(struct shrinker *shrink, struct shrink_control *sc) +{ + unsigned long ret; + + ret = list_lru_walk(&binder_alloc_lru, binder_alloc_free_page, + NULL, sc->nr_to_scan); + return ret; +} + +struct shrinker binder_shrinker = { + .count_objects = binder_shrink_count, + .scan_objects = binder_shrink_scan, + .seeks = DEFAULT_SEEKS, +}; + /** * binder_alloc_init() - called by binder_open() for per-proc initialization * @alloc: binder_alloc for this proc @@ -830,3 +952,8 @@ void binder_alloc_init(struct binder_alloc *alloc) INIT_LIST_HEAD(&alloc->buffers); } +void binder_alloc_shrinker_init(void) +{ + list_lru_init(&binder_alloc_lru); + register_shrinker(&binder_shrinker); +} diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index dd5649bf6469..fa707cc63393 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -21,7 +21,9 @@ #include #include #include +#include +extern struct list_lru binder_alloc_lru; struct binder_transaction; /** @@ -60,6 +62,18 @@ struct binder_buffer { void *data; }; +/** + * struct binder_lru_page - page object used for binder shrinker + * @page_ptr: pointer to physical page in mmap'd space + * @lru: entry in binder_alloc_lru + * @alloc: binder_alloc for a proc + */ +struct binder_lru_page { + struct list_head lru; + struct page *page_ptr; + struct binder_alloc *alloc; +}; + /** * struct binder_alloc - per-binder proc state for binder allocator * @vma: vm_area_struct passed to mmap_handler @@ -75,8 +89,7 @@ struct binder_buffer { * @allocated_buffers: rb tree of allocated buffers sorted by address * @free_async_space: VA space available for async buffers. This is * initialized at mmap time to 1/2 the full VA space - * @pages: array of physical page addresses for each - * page of mmap'd space + * @pages: array of binder_lru_page * @buffer_size: size of address space specified via mmap * @pid: pid for associated binder_proc (invariant after init) * @@ -96,7 +109,7 @@ struct binder_alloc { struct rb_root free_buffers; struct rb_root allocated_buffers; size_t free_async_space; - struct page **pages; + struct binder_lru_page *pages; size_t buffer_size; uint32_t buffer_free; int pid; @@ -107,12 +120,16 @@ void binder_selftest_alloc(struct binder_alloc *alloc); #else static inline void binder_selftest_alloc(struct binder_alloc *alloc) {} #endif +enum lru_status binder_alloc_free_page(struct list_head *item, + struct list_lru_one *lru, + spinlock_t *lock, void *cb_arg); extern struct binder_buffer *binder_alloc_new_buf(struct binder_alloc *alloc, size_t data_size, size_t offsets_size, size_t extra_buffers_size, int is_async); extern void binder_alloc_init(struct binder_alloc *alloc); +void binder_alloc_shrinker_init(void); extern void binder_alloc_vma_close(struct binder_alloc *alloc); extern struct binder_buffer * binder_alloc_prepare_to_free(struct binder_alloc *alloc, diff --git a/drivers/android/binder_alloc_selftest.c b/drivers/android/binder_alloc_selftest.c index 0bf72079a9da..8bd7bcef967d 100644 --- a/drivers/android/binder_alloc_selftest.c +++ b/drivers/android/binder_alloc_selftest.c @@ -109,9 +109,11 @@ static bool check_buffer_pages_allocated(struct binder_alloc *alloc, page_addr = buffer->data; for (; page_addr < end; page_addr += PAGE_SIZE) { page_index = (page_addr - alloc->buffer) / PAGE_SIZE; - if (!alloc->pages[page_index]) { - pr_err("incorrect alloc state at page index %d\n", - page_index); + if (!alloc->pages[page_index].page_ptr || + !list_empty(&alloc->pages[page_index].lru)) { + pr_err("expect alloc but is %s at page index %d\n", + alloc->pages[page_index].page_ptr ? + "lru" : "free", page_index); return false; } } @@ -137,28 +139,63 @@ static void binder_selftest_alloc_buf(struct binder_alloc *alloc, static void binder_selftest_free_buf(struct binder_alloc *alloc, struct binder_buffer *buffers[], - size_t *sizes, int *seq) + size_t *sizes, int *seq, size_t end) { int i; for (i = 0; i < BUFFER_NUM; i++) binder_alloc_free_buf(alloc, buffers[seq[i]]); + for (i = 0; i < end / PAGE_SIZE; i++) { + /** + * Error message on a free page can be false positive + * if binder shrinker ran during binder_alloc_free_buf + * calls above. + */ + if (list_empty(&alloc->pages[i].lru)) { + pr_err_size_seq(sizes, seq); + pr_err("expect lru but is %s at page index %d\n", + alloc->pages[i].page_ptr ? "alloc" : "free", i); + binder_selftest_failures++; + } + } +} + +static void binder_selftest_free_page(struct binder_alloc *alloc) +{ + int i; + unsigned long count; + + while ((count = list_lru_count(&binder_alloc_lru))) { + list_lru_walk(&binder_alloc_lru, binder_alloc_free_page, + NULL, count); + } + for (i = 0; i < (alloc->buffer_size / PAGE_SIZE); i++) { - if ((!alloc->pages[i]) == (i == 0)) { - pr_err("incorrect free state at page index %d\n", i); + if (alloc->pages[i].page_ptr) { + pr_err("expect free but is %s at page index %d\n", + list_empty(&alloc->pages[i].lru) ? + "alloc" : "lru", i); binder_selftest_failures++; } } } static void binder_selftest_alloc_free(struct binder_alloc *alloc, - size_t *sizes, int *seq) + size_t *sizes, int *seq, size_t end) { struct binder_buffer *buffers[BUFFER_NUM]; binder_selftest_alloc_buf(alloc, buffers, sizes, seq); - binder_selftest_free_buf(alloc, buffers, sizes, seq); + binder_selftest_free_buf(alloc, buffers, sizes, seq, end); + + /* Allocate from lru. */ + binder_selftest_alloc_buf(alloc, buffers, sizes, seq); + if (list_lru_count(&binder_alloc_lru)) + pr_err("lru list should be empty but is not\n"); + + binder_selftest_free_buf(alloc, buffers, sizes, seq, end); + binder_selftest_free_page(alloc); } static bool is_dup(int *seq, int index, int val) @@ -174,19 +211,20 @@ static bool is_dup(int *seq, int index, int val) /* Generate BUFFER_NUM factorial free orders. */ static void binder_selftest_free_seq(struct binder_alloc *alloc, - size_t *sizes, int *seq, int index) + size_t *sizes, int *seq, + int index, size_t end) { int i; if (index == BUFFER_NUM) { - binder_selftest_alloc_free(alloc, sizes, seq); + binder_selftest_alloc_free(alloc, sizes, seq, end); return; } for (i = 0; i < BUFFER_NUM; i++) { if (is_dup(seq, index, i)) continue; seq[index] = i; - binder_selftest_free_seq(alloc, sizes, seq, index + 1); + binder_selftest_free_seq(alloc, sizes, seq, index + 1, end); } } @@ -211,8 +249,9 @@ static void binder_selftest_alloc_size(struct binder_alloc *alloc, * we need one giant buffer before getting to the last page. */ back_sizes[0] += alloc->buffer_size - end_offset[BUFFER_NUM - 1]; - binder_selftest_free_seq(alloc, front_sizes, seq, 0); - binder_selftest_free_seq(alloc, back_sizes, seq, 0); + binder_selftest_free_seq(alloc, front_sizes, seq, 0, + end_offset[BUFFER_NUM - 1]); + binder_selftest_free_seq(alloc, back_sizes, seq, 0, alloc->buffer_size); } static void binder_selftest_alloc_offset(struct binder_alloc *alloc, @@ -246,7 +285,8 @@ static void binder_selftest_alloc_offset(struct binder_alloc *alloc, * * Allocate BUFFER_NUM buffers to cover all page alignment cases, * then free them in all orders possible. Check that pages are - * allocated after buffer alloc and freed after freeing buffer. + * correctly allocated, put onto lru when buffers are freed, and + * are freed when binder_alloc_free_page is called. */ void binder_selftest_alloc(struct binder_alloc *alloc) { From 2d227ce952a82c540943600f80d8252f4df83045 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Wed, 2 Aug 2017 14:02:37 -0700 Subject: [PATCH 162/546] FROMLIST: android: binder: Add shrinker tracepoints (from https://patchwork.kernel.org/patch/9928613/) Add tracepoints in binder transaction allocator to record lru hits and alloc/free page. Bug: 63926541 Change-Id: I2e24fe8e7b6534349df4a87ff865a6843ac9a30b Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 27 +++++++++++++++-- drivers/android/binder_trace.h | 55 ++++++++++++++++++++++++++++++++++ 2 files changed, 80 insertions(+), 2 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index bcdaf684a658..28dc76473ff6 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -237,18 +237,25 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, for (page_addr = start; page_addr < end; page_addr += PAGE_SIZE) { int ret; bool on_lru; + size_t index; - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + index = (page_addr - alloc->buffer) / PAGE_SIZE; + page = &alloc->pages[index]; if (page->page_ptr) { + trace_binder_alloc_lru_start(alloc, index); + on_lru = list_lru_del(&binder_alloc_lru, &page->lru); WARN_ON(!on_lru); + + trace_binder_alloc_lru_end(alloc, index); continue; } if (WARN_ON(!vma)) goto err_page_ptr_cleared; + trace_binder_alloc_page_start(alloc, index); page->page_ptr = alloc_page(GFP_KERNEL | __GFP_HIGHMEM | __GFP_ZERO); @@ -278,6 +285,8 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, alloc->pid, user_page_addr); goto err_vm_insert_page_failed; } + + trace_binder_alloc_page_end(alloc, index); /* vm_insert_page does not seem to increment the refcount */ } if (mm) { @@ -290,11 +299,17 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, for (page_addr = end - PAGE_SIZE; page_addr >= start; page_addr -= PAGE_SIZE) { bool ret; + size_t index; - page = &alloc->pages[(page_addr - alloc->buffer) / PAGE_SIZE]; + index = (page_addr - alloc->buffer) / PAGE_SIZE; + page = &alloc->pages[index]; + + trace_binder_free_lru_start(alloc, index); ret = list_lru_add(&binder_alloc_lru, &page->lru); WARN_ON(!ret); + + trace_binder_free_lru_end(alloc, index); continue; err_vm_insert_page_failed: @@ -887,19 +902,27 @@ enum lru_status binder_alloc_free_page(struct list_head *item, if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + trace_binder_unmap_user_start(alloc, index); + zap_page_range(alloc->vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE, NULL); + trace_binder_unmap_user_end(alloc, index); + up_write(&mm->mmap_sem); mmput(mm); } + trace_binder_unmap_kernel_start(alloc, index); + unmap_kernel_range(page_addr, PAGE_SIZE); __free_page(page->page_ptr); page->page_ptr = NULL; + trace_binder_unmap_kernel_end(alloc, index); + list_lru_isolate(lru, item); mutex_unlock(&alloc->mutex); diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 7967db16ba5a..76e3b9c8a8a2 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -291,6 +291,61 @@ TRACE_EVENT(binder_update_page_range, __entry->offset, __entry->size) ); +DECLARE_EVENT_CLASS(binder_lru_page_class, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index), + TP_STRUCT__entry( + __field(int, proc) + __field(size_t, page_index) + ), + TP_fast_assign( + __entry->proc = alloc->pid; + __entry->page_index = page_index; + ), + TP_printk("proc=%d page_index=%zu", + __entry->proc, __entry->page_index) +); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_lru_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_free_lru_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_free_lru_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_alloc_page_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_user_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_start, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + +DEFINE_EVENT(binder_lru_page_class, binder_unmap_kernel_end, + TP_PROTO(const struct binder_alloc *alloc, size_t page_index), + TP_ARGS(alloc, page_index)); + TRACE_EVENT(binder_command, TP_PROTO(uint32_t cmd), TP_ARGS(cmd), From f28a37c83efec80cb1db9af987b6cbbfa1873806 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Tue, 22 Aug 2017 17:26:57 -0700 Subject: [PATCH 163/546] FROMLIST: android: binder: Add page usage in binder stats (from https://patchwork.kernel.org/patch/9928611/) Add the number of active, lru, and free pages for each binder process in binder stats Bug: 63926541 Change-Id: I12618e4eb8ecc08f4f05fe4cba454a88830897f9 Signed-off-by: Sherry Yang --- drivers/android/binder.c | 2 ++ drivers/android/binder_alloc.c | 28 ++++++++++++++++++++++++++++ drivers/android/binder_alloc.h | 2 ++ 3 files changed, 32 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 3ce96db8e5d4..799de03d75a8 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -5427,6 +5427,8 @@ static void print_binder_proc_stats(struct seq_file *m, count = binder_alloc_get_allocated_count(&proc->alloc); seq_printf(m, " buffers: %d\n", count); + binder_alloc_print_pages(m, &proc->alloc); + count = 0; binder_inner_proc_lock(proc); list_for_each_entry(w, &proc->todo, entry) { diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 28dc76473ff6..1c1a7d9c86ed 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -831,6 +831,34 @@ void binder_alloc_print_allocated(struct seq_file *m, mutex_unlock(&alloc->mutex); } +/** + * binder_alloc_print_pages() - print page usage + * @m: seq_file for output via seq_printf() + * @alloc: binder_alloc for this proc + */ +void binder_alloc_print_pages(struct seq_file *m, + struct binder_alloc *alloc) +{ + struct binder_lru_page *page; + int i; + int active = 0; + int lru = 0; + int free = 0; + + mutex_lock(&alloc->mutex); + for (i = 0; i < alloc->buffer_size / PAGE_SIZE; i++) { + page = &alloc->pages[i]; + if (!page->page_ptr) + free++; + else if (list_empty(&page->lru)) + active++; + else + lru++; + } + mutex_unlock(&alloc->mutex); + seq_printf(m, " pages: %d:%d:%d\n", active, lru, free); +} + /** * binder_alloc_get_allocated_count() - return count of buffers * @alloc: binder_alloc for this proc diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index fa707cc63393..a3a3602c689c 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -142,6 +142,8 @@ extern void binder_alloc_deferred_release(struct binder_alloc *alloc); extern int binder_alloc_get_allocated_count(struct binder_alloc *alloc); extern void binder_alloc_print_allocated(struct seq_file *m, struct binder_alloc *alloc); +void binder_alloc_print_pages(struct seq_file *m, + struct binder_alloc *alloc); /** * binder_alloc_get_free_async_space() - get free space available for async From 4771c96d16e115b0596ed71c9d185e257044d480 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 26 May 2017 11:11:47 -0700 Subject: [PATCH 164/546] sched: deadline: WALT: account cumulative runnable avg Account cumulative runnable average for WALT CPU utilization accounting. Change-Id: I56934894e626dec183740eeaf89a57d2ef638143 Signed-off-by: Joonwoo Park --- kernel/sched/deadline.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 9ec7f19b5020..2aae8b8b68e8 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -18,6 +18,8 @@ #include +#include "walt.h" + struct dl_bandwidth def_dl_bandwidth; static inline struct task_struct *dl_task_of(struct sched_dl_entity *dl_se) @@ -882,6 +884,7 @@ void inc_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_prio(prio)); dl_rq->dl_nr_running++; add_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_inc_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); inc_dl_deadline(dl_rq, deadline); inc_dl_migration(dl_se, dl_rq); @@ -896,6 +899,7 @@ void dec_dl_tasks(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) WARN_ON(!dl_rq->dl_nr_running); dl_rq->dl_nr_running--; sub_nr_running(rq_of_dl_rq(dl_rq), 1); + walt_dec_cumulative_runnable_avg(rq_of_dl_rq(dl_rq), dl_task_of(dl_se)); dec_dl_deadline(dl_rq, dl_se->deadline); dec_dl_migration(dl_se, dl_rq); From 1eca70e4c44e3ebb28672cc08618d2f5a71e3a36 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 26 May 2017 11:19:36 -0700 Subject: [PATCH 165/546] sched: WALT: fix broken cumulative runnable average accounting When running tasks's ravg.demand is changed update_history() adjusts rq->cumulative_runnable_avg to reflect change of CPU load. Currently this fixup is broken by accumulating task's new demand without subtracting the task's old demand. Fix the fixup logic to subtract the task's old demand. Change-Id: I61beb32a4850879ccb39b733f5564251e465bfeb Signed-off-by: Joonwoo Park --- kernel/sched/walt.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 92c3aae8e056..166641ed1f39 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -111,8 +111,10 @@ walt_dec_cumulative_runnable_avg(struct rq *rq, static void fixup_cumulative_runnable_avg(struct rq *rq, - struct task_struct *p, s64 task_load_delta) + struct task_struct *p, u64 new_task_load) { + s64 task_load_delta = (s64)new_task_load - task_load(p); + rq->cumulative_runnable_avg += task_load_delta; if ((s64)rq->cumulative_runnable_avg < 0) panic("cra less than zero: tld: %lld, task_load(p) = %u\n", From cf9ee81955af980a6fbec7561833608357f5e28e Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Thu, 8 Dec 2016 16:12:12 -0800 Subject: [PATCH 166/546] sched: EAS/WALT: use cr_avg instead of prev_runnable_sum WALT accounts two major statistics; CPU load and cumulative tasks demand. The CPU load which is account of accumulated each CPU's absolute execution time is for CPU frequency guidance. Whereas cumulative tasks demand which is each CPU's instantaneous load to reflect CPU's load at given time is for task placement decision. Use cumulative tasks demand for cpu_util() for task placement and introduce cpu_util_freq() for frequency guidance. Change-Id: Id928f01dbc8cb2a617cdadc584c1f658022565c5 Signed-off-by: Joonwoo Park --- kernel/sched/core.c | 2 +- kernel/sched/fair.c | 4 ++-- kernel/sched/sched.h | 16 +++++++++++++++- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 9307827cc7b1..4f97de8e0b18 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2992,7 +2992,7 @@ static void sched_freq_tick_pelt(int cpu) #ifdef CONFIG_SCHED_WALT static void sched_freq_tick_walt(int cpu) { - unsigned long cpu_utilization = cpu_util(cpu); + unsigned long cpu_utilization = cpu_util_freq(cpu); unsigned long capacity_curr = capacity_curr_of(cpu); if (walt_disabled || !sysctl_sched_use_walt_cpu_util) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index c193e9b1c38f..3641dad3d4cc 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4659,7 +4659,7 @@ static inline void hrtick_update(struct rq *rq) static bool cpu_overutilized(int cpu); unsigned long boosted_cpu_util(int cpu); #else -#define boosted_cpu_util(cpu) cpu_util(cpu) +#define boosted_cpu_util(cpu) cpu_util_freq(cpu) #endif #ifdef CONFIG_SMP @@ -5937,7 +5937,7 @@ schedtune_task_margin(struct task_struct *task) unsigned long boosted_cpu_util(int cpu) { - unsigned long util = cpu_util(cpu); + unsigned long util = cpu_util_freq(cpu); long margin = schedtune_cpu_margin(util, cpu); trace_sched_boost_cpu(cpu, util, margin); diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 029cf2bbeda2..73077f535e95 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1592,7 +1592,7 @@ static inline unsigned long __cpu_util(int cpu, int delta) #ifdef CONFIG_SCHED_WALT if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_LOAD_SHIFT; do_div(util, walt_ravg_window); } #endif @@ -1608,6 +1608,20 @@ static inline unsigned long cpu_util(int cpu) return __cpu_util(cpu, 0); } +static inline unsigned long cpu_util_freq(int cpu) +{ + unsigned long util = cpu_rq(cpu)->cfs.avg.util_avg; + unsigned long capacity = capacity_orig_of(cpu); + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { + util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; + do_div(util, walt_ravg_window); + } +#endif + return (util >= capacity) ? capacity : util; +} + #endif #ifdef CONFIG_CPU_FREQ_GOV_SCHED From 55f3247db95210c061ed3e24bfea1130b59f867d Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 16 Jan 2017 18:09:20 -0800 Subject: [PATCH 167/546] sched: EAS: schedfreq: fix CPU util over estimation WALT CPU utilization reports CPU load of all the scheduler classes. Therefore adding RT class's load additionally will cause frequency overshooting. Fix such issue by not accounting RT class load when requesting capacity. Change-Id: I29600d7af7ca8c00e0d2ff1e13872024ccaa72bf Signed-off-by: Joonwoo Park --- kernel/sched/cpufreq_sched.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/kernel/sched/cpufreq_sched.c b/kernel/sched/cpufreq_sched.c index f10d9f7d6d07..6ffb23adbcef 100644 --- a/kernel/sched/cpufreq_sched.c +++ b/kernel/sched/cpufreq_sched.c @@ -235,6 +235,18 @@ static void update_fdomain_capacity_request(int cpu) cpufreq_cpu_put(policy); } +#ifdef CONFIG_SCHED_WALT +static inline unsigned long +requested_capacity(struct sched_capacity_reqs *scr) +{ + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + return scr->cfs; + return scr->cfs + scr->rt; +} +#else +#define requested_capacity(scr) (scr->cfs + scr->rt) +#endif + void update_cpu_capacity_request(int cpu, bool request) { unsigned long new_capacity; @@ -245,7 +257,7 @@ void update_cpu_capacity_request(int cpu, bool request) scr = &per_cpu(cpu_sched_capacity_reqs, cpu); - new_capacity = scr->cfs + scr->rt; + new_capacity = requested_capacity(scr); new_capacity = new_capacity * capacity_margin / SCHED_CAPACITY_SCALE; new_capacity += scr->dl; From 9305d63017439c2cb0ae4c2c98b5a379c5d7b9d2 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Fri, 20 Jan 2017 11:10:15 -0800 Subject: [PATCH 168/546] sched: WALT: fix potential overflow Task demand and CPU util are in u64. Change-Id: If7ec1623e723026d3346201122aab0303a6d2ba2 Signed-off-by: Joonwoo Park --- kernel/sched/sched.h | 14 ++++++-------- 1 file changed, 6 insertions(+), 8 deletions(-) diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 73077f535e95..d4613c5be81d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1591,10 +1591,9 @@ static inline unsigned long __cpu_util(int cpu, int delta) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->cumulative_runnable_avg << SCHED_LOAD_SHIFT; - do_div(util, walt_ravg_window); - } + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->cumulative_runnable_avg, + walt_ravg_window >> SCHED_LOAD_SHIFT); #endif delta += util; if (delta < 0) @@ -1614,10 +1613,9 @@ static inline unsigned long cpu_util_freq(int cpu) unsigned long capacity = capacity_orig_of(cpu); #ifdef CONFIG_SCHED_WALT - if (!walt_disabled && sysctl_sched_use_walt_cpu_util) { - util = cpu_rq(cpu)->prev_runnable_sum << SCHED_LOAD_SHIFT; - do_div(util, walt_ravg_window); - } + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + util = div64_u64(cpu_rq(cpu)->prev_runnable_sum, + walt_ravg_window >> SCHED_LOAD_SHIFT); #endif return (util >= capacity) ? capacity : util; } From d6614c4e2c49df3cc32a77c3b3a90fff1b4df597 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Wed, 25 Jan 2017 17:07:19 -0800 Subject: [PATCH 169/546] cpufreq: sched: WALT: don't apply capacity margin twice With WALT all the scheduler classes' load are accounted in scr->cfs and update_cpu_capacity_request() adds capacity margin. At present, at tick path, scheduler also adds capacity margin. Therefore the margin applied twice. Fix such error by using margin applied cpu utilization only for checking whether frequency increase is needed. Change-Id: Id7d8cc73b2e4eec70b274ca66e09bb0b16bf6f09 Signed-off-by: Joonwoo Park (trivial rebase conflict) Signed-off-by: Chris Redpath --- kernel/sched/core.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 4f97de8e0b18..0b14b4634b8c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2999,13 +2999,13 @@ static void sched_freq_tick_walt(int cpu) return sched_freq_tick_pelt(cpu); /* - * Add a margin to the WALT utilization. + * Add a margin to the WALT utilization to check if we will need to + * increase frequency. * NOTE: WALT tracks a single CPU signal for all the scheduling * classes, thus this margin is going to be added to the DL class as * well, which is something we do not do in sched_freq_tick_pelt case. */ - cpu_utilization = add_capacity_margin(cpu_utilization); - if (cpu_utilization <= capacity_curr) + if (add_capacity_margin(cpu_utilization) <= capacity_curr) return; /* From 77ce105b15fb81b1aa1684a82f8a8d03e4eb22b1 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Wed, 25 Jan 2017 17:45:56 -0800 Subject: [PATCH 170/546] sched: EAS/WALT: take into account of waking task's load WALT's function cpu_util(cpu) reports CPU's load without taking into account of waking task's load. Thus currently cpu_overutilized() underestimates load on the previous CPU of waking task. Take into account of task's load to determine whether previous CPU is overutilzed to bail out early without running energy_diff() which is expensive. Change-Id: I30f146984a880ad2cc1b8a4ce35bd239a8c9a607 Signed-off-by: Joonwoo Park (minor rebase conflicts) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 3641dad3d4cc..94bb5786d8a7 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4656,6 +4656,7 @@ static inline void hrtick_update(struct rq *rq) #endif #ifdef CONFIG_SMP +static bool __cpu_overutilized(int cpu, int delta); static bool cpu_overutilized(int cpu); unsigned long boosted_cpu_util(int cpu); #else @@ -5856,9 +5857,14 @@ static inline bool task_fits_max(struct task_struct *p, int cpu) return __task_fits(p, cpu, 0); } +static bool __cpu_overutilized(int cpu, int delta) +{ + return (capacity_of(cpu) * 1024) < ((cpu_util(cpu) + delta) * capacity_margin); +} + static bool cpu_overutilized(int cpu) { - return (capacity_of(cpu) * 1024) < (cpu_util(cpu) * capacity_margin); + return __cpu_overutilized(cpu, 0); } #ifdef CONFIG_SCHED_TUNE @@ -6577,6 +6583,7 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync } if (target_cpu != prev_cpu) { + int delta = 0; struct energy_env eenv = { .util_delta = task_util(p), .src_cpu = prev_cpu, @@ -6584,8 +6591,13 @@ static int select_energy_cpu_brute(struct task_struct *p, int prev_cpu, int sync .task = p, }; + +#ifdef CONFIG_SCHED_WALT + if (!walt_disabled && sysctl_sched_use_walt_cpu_util) + delta = task_util(p); +#endif /* Not enough spare capacity on previous cpu */ - if (cpu_overutilized(prev_cpu)) { + if (__cpu_overutilized(prev_cpu, delta)) { schedstat_inc(p, se.statistics.nr_wakeups_secb_insuff_cap); schedstat_inc(this_rq(), eas_stats.secb_insuff_cap); goto unlock; From 9b792188f9a6f89d93f142af171ad663f476ce9e Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Mon, 24 Jul 2017 13:23:05 +0100 Subject: [PATCH 171/546] sched: EAS: fix incorrect energy delta calculation due to rounding error In order to calculate energy difference we currently iterates CPUs under the same sched doamin to accumulate total energy cost and compare before and after : for_each_domain(cpu) total_energy_before += (cpu_util * power) >> SCHED_CAPACITY_SHIFT; for_each_domain(cpu) total_energy_after += (cpu_util * power) >> SCHED_CAPACITY_SHIFT; Doing such can incorrectly calculate and report abs(delta) > 0 when there is actually no energy delta between before and after because the same total accumulated cpu_util of all the CPUs can be distributed differently before and after and it causes different amount of rounding error. Fix such incorrectness by shifting just once with accumulated total_energy. Change-Id: I82f1e2e358367058960938b4ef81714f57e921cf Signed-off-by: Joonwoo Park (moved part to another commit) Signed-off-by: Chris Redpath --- kernel/sched/fair.c | 18 +++++++----------- 1 file changed, 7 insertions(+), 11 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 94bb5786d8a7..61c8952c7aaf 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -5473,10 +5473,8 @@ static int group_idle_state(struct energy_env *eenv, struct sched_group *sg) */ static int sched_group_energy(struct energy_env *eenv) { - struct sched_domain *sd; - int cpu, total_energy = 0; struct cpumask visit_cpus; - struct sched_group *sg; + u64 total_energy = 0; WARN_ON(!eenv->sg_top->sge); @@ -5484,8 +5482,8 @@ static int sched_group_energy(struct energy_env *eenv) while (!cpumask_empty(&visit_cpus)) { struct sched_group *sg_shared_cap = NULL; - - cpu = cpumask_first(&visit_cpus); + int cpu = cpumask_first(&visit_cpus); + struct sched_domain *sd; /* * Is the group utilization affected by cpus outside this @@ -5497,7 +5495,7 @@ static int sched_group_energy(struct energy_env *eenv) sg_shared_cap = sd->parent->groups; for_each_domain(cpu, sd) { - sg = sd->groups; + struct sched_group *sg = sd->groups; /* Has this sched_domain already been visited? */ if (sd->child && group_first_cpu(sg) != cpu) @@ -5533,11 +5531,9 @@ static int sched_group_energy(struct energy_env *eenv) idle_idx = group_idle_state(eenv, sg); group_util = group_norm_util(eenv, sg); - sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power) - >> SCHED_CAPACITY_SHIFT; + sg_busy_energy = (group_util * sg->sge->cap_states[cap_idx].power); sg_idle_energy = ((SCHED_LOAD_SCALE-group_util) - * sg->sge->idle_states[idle_idx].power) - >> SCHED_CAPACITY_SHIFT; + * sg->sge->idle_states[idle_idx].power); total_energy += sg_busy_energy + sg_idle_energy; @@ -5562,7 +5558,7 @@ static int sched_group_energy(struct energy_env *eenv) continue; } - eenv->energy = total_energy; + eenv->energy = total_energy >> SCHED_CAPACITY_SHIFT; return 0; } From d5a3aedb50a09bd82f9ca154dca8c014a492d3d1 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Tue, 7 Mar 2017 18:08:24 -0800 Subject: [PATCH 172/546] sched: EAS: kill incorrect nohz idle cpu kick EAS won't allow NOHZ idle balancer until CPU's over utilized. However nohz_kick_needed() can return true. This causes idle CPU wake up for nothing. Change-Id: I6e548442e29e4f85cda695e4c7101dd591b12fe6 Signed-off-by: Joonwoo Park --- kernel/sched/fair.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 61c8952c7aaf..0f43ece69e8c 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -9768,12 +9768,12 @@ static inline bool nohz_kick_needed(struct rq *rq) return true; /* Do idle load balance if there have misfit task */ - if (energy_aware() && rq->misfit_task) - return true; + if (energy_aware()) + return rq->misfit_task; rcu_read_lock(); sd = rcu_dereference(per_cpu(sd_busy, cpu)); - if (sd && !energy_aware()) { + if (sd) { sgc = sd->groups->sgc; nr_busy = atomic_read(&sgc->nr_busy_cpus); From 7530f3548ddbfd3ac4b6d2e1dbf4e529db871dc0 Mon Sep 17 00:00:00 2001 From: Joonwoo Park Date: Tue, 16 May 2017 11:13:00 -0700 Subject: [PATCH 173/546] sched: WALT: fix window mis-alignment The initial window start needs to be close to ktime ns = 0 to be aligned with scheduler tick. Change-Id: Ia91f74efce2f910106622a054a6fcd507e763ca5 Signed-off-by: Joonwoo Park --- kernel/sched/walt.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/sched/walt.c b/kernel/sched/walt.c index 166641ed1f39..28e999554463 100644 --- a/kernel/sched/walt.c +++ b/kernel/sched/walt.c @@ -804,11 +804,11 @@ void walt_set_window_start(struct rq *rq) int cpu = cpu_of(rq); struct rq *sync_rq = cpu_rq(sync_cpu); - if (rq->window_start) + if (likely(rq->window_start)) return; if (cpu == sync_cpu) { - rq->window_start = walt_ktime_clock(); + rq->window_start = 1; } else { raw_spin_unlock(&rq->lock); double_rq_lock(rq, sync_rq); From 52d7f79173dc532076863f5d9762009f028f84f9 Mon Sep 17 00:00:00 2001 From: Xu YiPing Date: Mon, 22 May 2017 11:26:23 -0700 Subject: [PATCH 174/546] FROMLIST: binder: fix memory corruption in binder_transaction binder (from https://patchwork.kernel.org/patch/9939405/) commit 7a4408c6bd3e ("binder: make sure accesses to proc/thread are safe") made a change to enqueue tcomplete to thread->todo before enqueuing the transaction. However, in err_dead_proc_or_thread case, the tcomplete is directly freed, without dequeued. It may cause the thread->todo list to be corrupted. So, dequeue it before freeing. Bug: 65333488 Change-Id: Id063a4db18deaa634f4d44aa6ebca47bea32537a Signed-off-by: Xu YiPing Signed-off-by: Todd Kjos --- drivers/android/binder.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 799de03d75a8..810ff70104c1 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -3253,6 +3253,7 @@ static void binder_transaction(struct binder_proc *proc, err_dead_proc_or_thread: return_error = BR_DEAD_REPLY; return_error_line = __LINE__; + binder_dequeue_work(proc, tcomplete); err_translate_failed: err_bad_object_type: err_bad_offset: From bfb37a2c258470f68b1879c3fb24acf992b9823a Mon Sep 17 00:00:00 2001 From: Xu YiPing Date: Tue, 5 Sep 2017 10:00:59 -0700 Subject: [PATCH 175/546] FROMLIST: binder: fix an ret value override (from https://patchwork.kernel.org/patch/9939409/) commit 372e3147df70 ("binder: guarantee txn complete / errors delivered in-order") incorrectly defined a local ret value. This ret value will be invalid when out of the if block Change-Id: If7bd963ac7e67d135aa949133263aac27bf15d1a Signed-off-by: Xu YiPing Signed-off-by: Todd Kjos --- drivers/android/binder.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 810ff70104c1..7c9f9dde8397 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2481,7 +2481,6 @@ static int binder_translate_handle(struct flat_binder_object *fp, (u64)node->ptr); binder_node_unlock(node); } else { - int ret; struct binder_ref_data dest_rdata; binder_node_unlock(node); From aa603fc147cbc2460475461566b537d6dfd8063b Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 11 Apr 2017 00:20:41 +0200 Subject: [PATCH 176/546] BACKPORT: cpufreq: schedutil: Use policy-dependent transition delays Make the schedutil governor take the initial (default) value of the rate_limit_us sysfs attribute from the (new) transition_delay_us policy parameter (to be set by the scaling driver). That will allow scaling drivers to make schedutil use smaller default values of rate_limit_us and reduce the default average time interval between consecutive frequency changes. Make intel_pstate set transition_delay_us to 500. BACKPORT: Modified to support the separate up_rate_limit_us and down_rate_limit_us (upstream just has a single rate_limit_us). Also dropped the changes for intel_pstate as there's a merge conflict. Change-Id: I62a8543879a4d8582cdcb31ebd55607705d1c8b1 Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar (cherry picked from commit 1b72e7fd304639f1cd49d1e11955c4974936d88c) Signed-off-by: Brendan Jackman --- include/linux/cpufreq.h | 10 +++++++++- kernel/sched/cpufreq_schedutil.c | 20 +++++++++++++------- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h index b144e7445477..490d577aa144 100644 --- a/include/linux/cpufreq.h +++ b/include/linux/cpufreq.h @@ -119,7 +119,15 @@ struct cpufreq_policy { bool fast_switch_possible; bool fast_switch_enabled; - /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ + /* + * Preferred average time interval between consecutive invocations of + * the driver to set the frequency for this policy. To be set by the + * scaling driver (0, which is the default, means no preference). + */ + unsigned int up_transition_delay_us; + unsigned int down_transition_delay_us; + + /* Cached frequency lookup from cpufreq_driver_resolve_freq. */ unsigned int cached_target_freq; int cached_resolved_idx; diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index a0bcf488fb81..b90f7434e13b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -615,7 +615,6 @@ static int sugov_init(struct cpufreq_policy *policy) { struct sugov_policy *sg_policy; struct sugov_tunables *tunables; - unsigned int lat; int ret = 0; /* State should be equivalent to EXIT */ @@ -654,12 +653,19 @@ static int sugov_init(struct cpufreq_policy *policy) goto stop_kthread; } - tunables->up_rate_limit_us = LATENCY_MULTIPLIER; - tunables->down_rate_limit_us = LATENCY_MULTIPLIER; - lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; - if (lat) { - tunables->up_rate_limit_us *= lat; - tunables->down_rate_limit_us *= lat; + if (policy->up_transition_delay_us && policy->down_transition_delay_us) { + tunables->up_rate_limit_us = policy->up_transition_delay_us; + tunables->down_rate_limit_us = policy->down_transition_delay_us; + } else { + unsigned int lat; + + tunables->up_rate_limit_us = LATENCY_MULTIPLIER; + tunables->down_rate_limit_us = LATENCY_MULTIPLIER; + lat = policy->cpuinfo.transition_latency / NSEC_PER_USEC; + if (lat) { + tunables->up_rate_limit_us *= lat; + tunables->down_rate_limit_us *= lat; + } } policy->governor_data = sg_policy; From d5a094a32657181ee432f0ab79a5f8c7e987a27a Mon Sep 17 00:00:00 2001 From: Brendan Jackman Date: Thu, 17 Aug 2017 14:38:21 +0100 Subject: [PATCH 177/546] ANDROID: cpufreq-dt: Set sane defaults for schedutil rate limits schedfreq used transition_latency as the default value for rate-limiting upward frequency transitions. schedutil instead uses a separate transition_delay_us field - if this is not set, it uses transition_latency * 1000, which is not sensible. The reason for the two separate values (transition_delay and transition_latency) is that they represent different quantities: "latency" reports how long it takes to execute a frequency transition. "delay" is a tunable whose ideal value is one that best balances the power cost of making a frequency transition with the benefit of more often selecting the correct frequency. "latency" is probably a lower bound on "delay". AFAIK we don't currently have a way to directly express the desired transition_delay_us field in the device tree, so for now let's just set it to the same as transition_latency, so we get similar default behaviour for schedutil as we got for schedfreq. This doesn't make any difference if userspace is setting the cpufreq tunables itself, it is just making the defaults saner. Change-Id: Iec92d710c79d9c10d0bef1b617942185448fc214 Signed-off-by: Brendan Jackman --- drivers/cpufreq/cpufreq-dt.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index f951f911786e..ebe3567c2cda 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -340,6 +340,13 @@ static void cpufreq_ready(struct cpufreq_policy *policy) } } + /* + * Android: set default parameters for parity between schedutil and + * schedfreq + */ + policy->up_transition_delay_us = transition_latency / NSEC_PER_USEC; + policy->down_transition_delay_us = 50000; /* 50ms */ + of_node_put(np); } From 8fa488de71429c02ad7912a9b4d9d28b7feebd80 Mon Sep 17 00:00:00 2001 From: Greg Kaiser Date: Tue, 5 Sep 2017 15:55:41 -0700 Subject: [PATCH 178/546] ANDROID: fiq_debugger: Fix minor bug in code We fix a typo in the code which had us comparing a pointer instead of the value which was being pointed to. This turns out to be a relatively benign bug, as we'd incorrectly pass in the empty string instead of NULL to the function, and the function can handle both. But we fix it so the code is clearly doing what we intend. Signed-off-by: Greg Kaiser Change-Id: Ib059819775a3bebca357d4ce684be779853156e3 --- drivers/staging/android/fiq_debugger/fiq_debugger.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/staging/android/fiq_debugger/fiq_debugger.c b/drivers/staging/android/fiq_debugger/fiq_debugger.c index b132cff14f01..ef9fa483ac00 100644 --- a/drivers/staging/android/fiq_debugger/fiq_debugger.c +++ b/drivers/staging/android/fiq_debugger/fiq_debugger.c @@ -401,7 +401,7 @@ static void fiq_debugger_work(struct work_struct *work) cmd += 6; while (*cmd == ' ') cmd++; - if (cmd != '\0') + if (*cmd != '\0') kernel_restart(cmd); else kernel_restart(NULL); From 632ae2677b661de0eb7dfceb7fdea81f9dc56ac8 Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 24 Aug 2017 16:57:20 -0700 Subject: [PATCH 179/546] Revert "ANDROID: Use sk_uid to replace uid get from socket file" This reverts commit 623f33f213deb39994decbdc7d3b8cea82c4d558. Bug: 37524657 Change-Id: I214dbf417e720c032d28d62b8a74ece5de9cd919 Signed-off-by: Chenbo Feng --- net/netfilter/xt_qtaguid.c | 50 +++++++++++++++++------------ net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 31 insertions(+), 23 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index fbe4e1ecce6a..f95aba44e649 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1717,9 +1717,18 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); + if (sk != NULL) { + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); + } - - if (sk == NULL) { + if (sk == NULL || sk->sk_socket == NULL) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1732,7 +1741,9 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (!(info->match & XT_QTAGUID_UID)) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); + MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", + par->hooknum, + sk ? sk->sk_socket : NULL); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1740,7 +1751,16 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - sock_uid = sk->sk_uid; + filp = sk->sk_socket->file; + if (filp == NULL) { + MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); + account_for_uid(skb, sk, 0, par); + res = ((info->match ^ info->invert) & + (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; + atomic64_inc(&qtu_events.match_no_sk_file); + goto put_sock_ret_res; + } + sock_uid = filp->f_cred->fsuid; /* * TODO: unhack how to force just accounting. * For now we only do iface stats when the uid-owner is not requested @@ -1758,8 +1778,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(sk->sk_uid, uid_min) && - uid_lte(sk->sk_uid, uid_max)) ^ + if ((uid_gte(filp->f_cred->fsuid, uid_min) && + uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1770,19 +1790,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - if (!filp) { - res = ((info->match ^ info->invert) & XT_QTAGUID_GID) == 0; - atomic64_inc(&qtu_events.match_no_sk_gid); - goto put_sock_ret_res; - } - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); + if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1954,7 +1962,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_gid=%llu\n", + "match_no_sk_file=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1966,7 +1974,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_gid)); + (u64)atomic64_read(&qtu_events.match_no_sk_file)); /* Count the following as part of the last item_index. No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index c7052707a6a4..8178fbdfb036 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there and we couldn't get GID. + * The file ptr in the sk_socket wasn't there. * This might happen for traffic while the socket is being closed. */ - atomic64_t match_no_sk_gid; + atomic64_t match_no_sk_file; }; /* Track the set active_set for the given tag. */ From d12b0da7ff78eb69478b4efd8c86e151eb20136d Mon Sep 17 00:00:00 2001 From: JP Abgrall Date: Fri, 20 Dec 2013 16:51:11 -0800 Subject: [PATCH 180/546] ANDROID: nf: xt_qtaguid: fix handling for cases where tunnels are used. * fix skb->dev vs par->in/out When there is some forwarding going on, it introduces extra state around devs associated with xt_action_param->in/out and sk_buff->dev. E.g. par->in and par->out are both set, or skb->dev and par->out are both set (and different) This would lead qtaguid to make the wrong assumption about the direction and update the wrong device stats. Now we rely more on par->in/out. * Fix handling when qtaguid is used as "owner" When qtaguid is used as an owner module, and sk_socket->file is not there (happens when tunnels are involved), it would incorrectly do a tag stats update. * Correct debug messages. Bug: 11687690 Change-Id: I2b1ff8bd7131969ce9e25f8291d83a6280b3ba7f CRs-Fixed: 747810 Signed-off-by: JP Abgrall Git-commit: 2b71479d6f5fe8f33b335f713380f72037244395 Git-repo: https://www.codeaurora.org/cgit/quic/la/kernel/mediatek [imaund@codeaurora.org: Resolved trivial context conflicts.] Signed-off-by: Ian Maund [bflowers@codeaurora.org: Resolved merge conflicts] Signed-off-by: Bryse Flowers Signed-off-by: Chenbo Feng --- net/netfilter/xt_qtaguid.c | 150 +++++++++++++++++-------------------- 1 file changed, 69 insertions(+), 81 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index f95aba44e649..fef69cb21f6f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1175,6 +1175,38 @@ static void iface_stat_update(struct net_device *net_dev, bool stash_only) spin_unlock_bh(&iface_stat_list_lock); } +/* Guarantied to return a net_device that has a name */ +static void get_dev_and_dir(const struct sk_buff *skb, + struct xt_action_param *par, + enum ifs_tx_rx *direction, + const struct net_device **el_dev) +{ + BUG_ON(!direction || !el_dev); + + if (par->in) { + *el_dev = par->in; + *direction = IFS_RX; + } else if (par->out) { + *el_dev = par->out; + *direction = IFS_TX; + } else { + pr_err("qtaguid[%d]: %s(): no par->in/out?!!\n", + par->hooknum, __func__); + BUG(); + } + if (unlikely(!(*el_dev)->name)) { + pr_err("qtaguid[%d]: %s(): no dev->name?!!\n", + par->hooknum, __func__); + BUG(); + } + if (skb->dev && *el_dev != skb->dev) { + MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs par->%s=%p %s\n", + par->hooknum, skb->dev, skb->dev->name, + *direction == IFS_RX ? "in" : "out", *el_dev, + (*el_dev)->name); + } +} + /* * Update stats for the specified interface from the skb. * Do nothing if the entry @@ -1186,50 +1218,27 @@ static void iface_stat_update_from_skb(const struct sk_buff *skb, { struct iface_stat *entry; const struct net_device *el_dev; - enum ifs_tx_rx direction = par->in ? IFS_RX : IFS_TX; + enum ifs_tx_rx direction; int bytes = skb->len; int proto; - if (!skb->dev) { - MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum); - el_dev = par->in ? : par->out; - } else { - const struct net_device *other_dev; - el_dev = skb->dev; - other_dev = par->in ? : par->out; - if (el_dev != other_dev) { - MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs " - "par->(in/out)=%p %s\n", - par->hooknum, el_dev, el_dev->name, other_dev, - other_dev->name); - } - } - - if (unlikely(!el_dev)) { - pr_err_ratelimited("qtaguid[%d]: %s(): no par->in/out?!!\n", - par->hooknum, __func__); - BUG(); - } else if (unlikely(!el_dev->name)) { - pr_err_ratelimited("qtaguid[%d]: %s(): no dev->name?!!\n", - par->hooknum, __func__); - BUG(); - } else { - proto = ipx_proto(skb, par); - MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n", - par->hooknum, el_dev->name, el_dev->type, - par->family, proto); - } + get_dev_and_dir(skb, par, &direction, &el_dev); + proto = ipx_proto(skb, par); + MT_DEBUG("qtaguid[%d]: iface_stat: %s(%s): " + "type=%d fam=%d proto=%d dir=%d\n", + par->hooknum, __func__, el_dev->name, el_dev->type, + par->family, proto, direction); spin_lock_bh(&iface_stat_list_lock); entry = get_iface_entry(el_dev->name); if (entry == NULL) { - IF_DEBUG("qtaguid: iface_stat: %s(%s): not tracked\n", - __func__, el_dev->name); + IF_DEBUG("qtaguid[%d]: iface_stat: %s(%s): not tracked\n", + par->hooknum, __func__, el_dev->name); spin_unlock_bh(&iface_stat_list_lock); return; } - IF_DEBUG("qtaguid: %s(%s): entry=%p\n", __func__, + IF_DEBUG("qtaguid[%d]: %s(%s): entry=%p\n", par->hooknum, __func__, el_dev->name, entry); data_counters_update(&entry->totals_via_skb, 0, direction, proto, @@ -1294,14 +1303,14 @@ static void if_tag_stat_update(const char *ifname, uid_t uid, spin_lock_bh(&iface_stat_list_lock); iface_entry = get_iface_entry(ifname); if (!iface_entry) { - pr_err_ratelimited("qtaguid: iface_stat: stat_update() " + pr_err_ratelimited("qtaguid: tag_stat: stat_update() " "%s not found\n", ifname); spin_unlock_bh(&iface_stat_list_lock); return; } /* It is ok to process data when an iface_entry is inactive */ - MT_DEBUG("qtaguid: iface_stat: stat_update() dev=%s entry=%p\n", + MT_DEBUG("qtaguid: tag_stat: stat_update() dev=%s entry=%p\n", ifname, iface_entry); /* @@ -1318,7 +1327,7 @@ static void if_tag_stat_update(const char *ifname, uid_t uid, tag = combine_atag_with_uid(acct_tag, uid); uid_tag = make_tag_from_uid(uid); } - MT_DEBUG("qtaguid: iface_stat: stat_update(): " + MT_DEBUG("qtaguid: tag_stat: stat_update(): " " looking for tag=0x%llx (uid=%u) in ife=%p\n", tag, get_uid_from_tag(tag), iface_entry); /* Loop over tag list under this interface for {acct_tag,uid_tag} */ @@ -1578,8 +1587,8 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, struct sock *sk; unsigned int hook_mask = (1 << par->hooknum); - MT_DEBUG("qtaguid: find_sk(skb=%p) hooknum=%d family=%d\n", skb, - par->hooknum, par->family); + MT_DEBUG("qtaguid[%d]: find_sk(skb=%p) family=%d\n", + par->hooknum, skb, par->family); /* * Let's not abuse the the xt_socket_get*_sk(), or else it will @@ -1600,8 +1609,8 @@ static struct sock *qtaguid_find_sk(const struct sk_buff *skb, } if (sk) { - MT_DEBUG("qtaguid: %p->sk_proto=%u " - "->sk_state=%d\n", sk, sk->sk_protocol, sk->sk_state); + MT_DEBUG("qtaguid[%d]: %p->sk_proto=%u->sk_state=%d\n", + par->hooknum, sk, sk->sk_protocol, sk->sk_state); /* * When in TCP_TIME_WAIT the sk is not a "struct sock" but * "struct inet_timewait_sock" which is missing fields. @@ -1619,37 +1628,19 @@ static void account_for_uid(const struct sk_buff *skb, struct xt_action_param *par) { const struct net_device *el_dev; + enum ifs_tx_rx direction; + int proto; - if (!skb->dev) { - MT_DEBUG("qtaguid[%d]: no skb->dev\n", par->hooknum); - el_dev = par->in ? : par->out; - } else { - const struct net_device *other_dev; - el_dev = skb->dev; - other_dev = par->in ? : par->out; - if (el_dev != other_dev) { - MT_DEBUG("qtaguid[%d]: skb->dev=%p %s vs " - "par->(in/out)=%p %s\n", - par->hooknum, el_dev, el_dev->name, other_dev, - other_dev->name); - } - } + get_dev_and_dir(skb, par, &direction, &el_dev); + proto = ipx_proto(skb, par); + MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d dir=%d\n", + par->hooknum, el_dev->name, el_dev->type, + par->family, proto, direction); - if (unlikely(!el_dev)) { - pr_info("qtaguid[%d]: no par->in/out?!!\n", par->hooknum); - } else if (unlikely(!el_dev->name)) { - pr_info("qtaguid[%d]: no dev->name?!!\n", par->hooknum); - } else { - int proto = ipx_proto(skb, par); - MT_DEBUG("qtaguid[%d]: dev name=%s type=%d fam=%d proto=%d\n", - par->hooknum, el_dev->name, el_dev->type, - par->family, proto); - - if_tag_stat_update(el_dev->name, uid, - skb->sk ? skb->sk : alternate_sk, - par->in ? IFS_RX : IFS_TX, - proto, skb->len); - } + if_tag_stat_update(el_dev->name, uid, + skb->sk ? skb->sk : alternate_sk, + direction, + proto, skb->len); } static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) @@ -1661,6 +1652,11 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t sock_uid; bool res; bool set_sk_callback_lock = false; + /* + * TODO: unhack how to force just accounting. + * For now we only do tag stats when the uid-owner is not requested + */ + bool do_tag_stat = !(info->match & XT_QTAGUID_UID); if (unlikely(module_passive)) return (info->match ^ info->invert) == 0; @@ -1734,12 +1730,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) * couldn't find the owner, so for now we just count them * against the system. */ - /* - * TODO: unhack how to force just accounting. - * For now we only do iface stats when the uid-owner is not - * requested. - */ - if (!(info->match & XT_QTAGUID_UID)) + if (do_tag_stat) account_for_uid(skb, sk, 0, par); MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", par->hooknum, @@ -1754,18 +1745,15 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) filp = sk->sk_socket->file; if (filp == NULL) { MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); - account_for_uid(skb, sk, 0, par); + if (do_tag_stat) + account_for_uid(skb, sk, 0, par); res = ((info->match ^ info->invert) & (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; atomic64_inc(&qtu_events.match_no_sk_file); goto put_sock_ret_res; } sock_uid = filp->f_cred->fsuid; - /* - * TODO: unhack how to force just accounting. - * For now we only do iface stats when the uid-owner is not requested - */ - if (!(info->match & XT_QTAGUID_UID)) + if (do_tag_stat) account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par); /* From a7173810070a0edeb51f7b82d8e65b5ea636825a Mon Sep 17 00:00:00 2001 From: Chenbo Feng Date: Thu, 20 Apr 2017 18:54:13 -0700 Subject: [PATCH 181/546] ANDROID: Use sk_uid to replace uid get from socket file Retrieve socket uid from the sk_uid field added to struct sk instead of read it from sk->socket->file. It prevent the packet been dropped when the socket file doesn't exist. Bug: 37524657 Signed-off-by: Chenbo Feng Change-Id: Ic58239c1f9aa7e0eb1d4d1c09d40b845fd4e8e57 --- net/netfilter/xt_qtaguid.c | 55 +++++++++++++---------------- net/netfilter/xt_qtaguid_internal.h | 4 +-- 2 files changed, 26 insertions(+), 33 deletions(-) diff --git a/net/netfilter/xt_qtaguid.c b/net/netfilter/xt_qtaguid.c index fef69cb21f6f..6f5cdc5b505f 100644 --- a/net/netfilter/xt_qtaguid.c +++ b/net/netfilter/xt_qtaguid.c @@ -1713,18 +1713,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) } MT_DEBUG("qtaguid[%d]: sk=%p got_sock=%d fam=%d proto=%d\n", par->hooknum, sk, got_sock, par->family, ipx_proto(skb, par)); - if (sk != NULL) { - set_sk_callback_lock = true; - read_lock_bh(&sk->sk_callback_lock); - MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", - par->hooknum, sk, sk->sk_socket, - sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); - filp = sk->sk_socket ? sk->sk_socket->file : NULL; - MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", - par->hooknum, filp ? from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); - } - if (sk == NULL || sk->sk_socket == NULL) { + if (!sk) { /* * Here, the qtaguid_find_sk() using connection tracking * couldn't find the owner, so for now we just count them @@ -1732,9 +1722,7 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) */ if (do_tag_stat) account_for_uid(skb, sk, 0, par); - MT_DEBUG("qtaguid[%d]: leaving (sk?sk->sk_socket)=%p\n", - par->hooknum, - sk ? sk->sk_socket : NULL); + MT_DEBUG("qtaguid[%d]: leaving (sk=NULL)\n", par->hooknum); res = (info->match ^ info->invert) == 0; atomic64_inc(&qtu_events.match_no_sk); goto put_sock_ret_res; @@ -1742,19 +1730,10 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) res = false; goto put_sock_ret_res; } - filp = sk->sk_socket->file; - if (filp == NULL) { - MT_DEBUG("qtaguid[%d]: leaving filp=NULL\n", par->hooknum); - if (do_tag_stat) - account_for_uid(skb, sk, 0, par); - res = ((info->match ^ info->invert) & - (XT_QTAGUID_UID | XT_QTAGUID_GID)) == 0; - atomic64_inc(&qtu_events.match_no_sk_file); - goto put_sock_ret_res; - } - sock_uid = filp->f_cred->fsuid; + sock_uid = sk->sk_uid; if (do_tag_stat) - account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), par); + account_for_uid(skb, sk, from_kuid(&init_user_ns, sock_uid), + par); /* * The following two tests fail the match when: @@ -1766,8 +1745,8 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) kuid_t uid_min = make_kuid(&init_user_ns, info->uid_min); kuid_t uid_max = make_kuid(&init_user_ns, info->uid_max); - if ((uid_gte(filp->f_cred->fsuid, uid_min) && - uid_lte(filp->f_cred->fsuid, uid_max)) ^ + if ((uid_gte(sock_uid, uid_min) && + uid_lte(sock_uid, uid_max)) ^ !(info->invert & XT_QTAGUID_UID)) { MT_DEBUG("qtaguid[%d]: leaving uid not matching\n", par->hooknum); @@ -1778,7 +1757,21 @@ static bool qtaguid_mt(const struct sk_buff *skb, struct xt_action_param *par) if (info->match & XT_QTAGUID_GID) { kgid_t gid_min = make_kgid(&init_user_ns, info->gid_min); kgid_t gid_max = make_kgid(&init_user_ns, info->gid_max); - + set_sk_callback_lock = true; + read_lock_bh(&sk->sk_callback_lock); + MT_DEBUG("qtaguid[%d]: sk=%p->sk_socket=%p->file=%p\n", + par->hooknum, sk, sk->sk_socket, + sk->sk_socket ? sk->sk_socket->file : (void *)-1LL); + filp = sk->sk_socket ? sk->sk_socket->file : NULL; + if (!filp) { + res = ((info->match ^ info->invert) & + XT_QTAGUID_GID) == 0; + atomic64_inc(&qtu_events.match_no_sk_gid); + goto put_sock_ret_res; + } + MT_DEBUG("qtaguid[%d]: filp...uid=%u\n", + par->hooknum, filp ? + from_kuid(&init_user_ns, filp->f_cred->fsuid) : -1); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_QTAGUID_GID)) { @@ -1950,7 +1943,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) "match_found_sk_in_ct=%llu " "match_found_no_sk_in_ct=%llu " "match_no_sk=%llu " - "match_no_sk_file=%llu\n", + "match_no_sk_gid=%llu\n", (u64)atomic64_read(&qtu_events.sockets_tagged), (u64)atomic64_read(&qtu_events.sockets_untagged), (u64)atomic64_read(&qtu_events.counter_set_changes), @@ -1962,7 +1955,7 @@ static int qtaguid_ctrl_proc_show(struct seq_file *m, void *v) (u64)atomic64_read(&qtu_events.match_found_sk_in_ct), (u64)atomic64_read(&qtu_events.match_found_no_sk_in_ct), (u64)atomic64_read(&qtu_events.match_no_sk), - (u64)atomic64_read(&qtu_events.match_no_sk_file)); + (u64)atomic64_read(&qtu_events.match_no_sk_gid)); /* Count the following as part of the last item_index. No need * to lock the sock_tag_list here since it is already locked when diff --git a/net/netfilter/xt_qtaguid_internal.h b/net/netfilter/xt_qtaguid_internal.h index 8178fbdfb036..c7052707a6a4 100644 --- a/net/netfilter/xt_qtaguid_internal.h +++ b/net/netfilter/xt_qtaguid_internal.h @@ -289,10 +289,10 @@ struct qtaguid_event_counts { */ atomic64_t match_no_sk; /* - * The file ptr in the sk_socket wasn't there. + * The file ptr in the sk_socket wasn't there and we couldn't get GID. * This might happen for traffic while the socket is being closed. */ - atomic64_t match_no_sk_file; + atomic64_t match_no_sk_gid; }; /* Track the set active_set for the given tag. */ From 5758aee5051f4cda0d71f28623fe3e771a8b1c4a Mon Sep 17 00:00:00 2001 From: gaurav jindal Date: Fri, 8 Sep 2017 00:07:43 +0530 Subject: [PATCH 182/546] drivers: cpufreq: checks to avoid kernel crash in cpufreq_interactive In cpufreq_governor_interactive, driver throws warning with WARN_ON for !tunables and event != CPUFREQ_GOV_POLICY_INIT. In case when tunables is NULL for event other than CPUFREQ_GOV_POLICY_INIT, kernel will crash as there is no safe check available before accessing tunables. So to handle such case and avoid the kernel crash, return -EINVAL if WARN_ON returns TRUE. Change-Id: I7a3a22d58e3c8a315a1cc1d31143649dc8807dee Signed-off-by: gaurav jindal --- drivers/cpufreq/cpufreq_interactive.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 889c9b8b2237..52ef2ea6b65c 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1148,7 +1148,8 @@ static int cpufreq_governor_interactive(struct cpufreq_policy *policy, else tunables = common_tunables; - WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT)); + if (WARN_ON(!tunables && (event != CPUFREQ_GOV_POLICY_INIT))) + return -EINVAL; switch (event) { case CPUFREQ_GOV_POLICY_INIT: From eca7e3d5662a9459f25e954e248fd03edb777170 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 6 Sep 2017 13:06:43 -0700 Subject: [PATCH 183/546] ANDROID: mnt: Fix freeing of mount data Fix double free on error paths Signed-off-by: Daniel Rosenberg Change-Id: I1c25a175e87e5dd5cafcdcf9d78bf4c0dc3f88ef Bug: 65386954 Fixes: 6b42d02561d3 ("ANDROID: mnt: Add filesystem private data to mount points") --- fs/namespace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index 15b91b36ecab..7e14bf1c851c 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -227,6 +227,7 @@ static struct mount *alloc_vfsmnt(const char *name) mnt->mnt_count = 1; mnt->mnt_writers = 0; #endif + mnt->mnt.data = NULL; INIT_HLIST_NODE(&mnt->mnt_hash); INIT_LIST_HEAD(&mnt->mnt_child); @@ -976,7 +977,6 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void if (!mnt) return ERR_PTR(-ENOMEM); - mnt->mnt.data = NULL; if (type->alloc_mnt_data) { mnt->mnt.data = type->alloc_mnt_data(); if (!mnt->mnt.data) { @@ -990,7 +990,6 @@ vfs_kern_mount(struct file_system_type *type, int flags, const char *name, void root = mount_fs(type, flags, name, &mnt->mnt, data); if (IS_ERR(root)) { - kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_CAST(root); @@ -1094,7 +1093,6 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, return mnt; out_free: - kfree(mnt->mnt.data); mnt_free_id(mnt); free_vfsmnt(mnt); return ERR_PTR(err); From c1f83c68577d7e9de5f66cee112a6d791a78eab1 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Wed, 19 Jul 2017 17:25:07 -0700 Subject: [PATCH 184/546] ANDROID: Sdcardfs: Move gid derivation under flag This moves the code to adjust the gid/uid of lower filesystem files under the mount flag derive_gid. Signed-off-by: Daniel Rosenberg Change-Id: I44eaad4ef67c7fcfda3b6ea3502afab94442610c Bug: 63245673 --- fs/sdcardfs/derived_perm.c | 3 +++ fs/sdcardfs/inode.c | 12 ++++++++---- fs/sdcardfs/main.c | 6 ++++++ fs/sdcardfs/sdcardfs.h | 1 + fs/sdcardfs/super.c | 2 ++ 5 files changed, 20 insertions(+), 4 deletions(-) diff --git a/fs/sdcardfs/derived_perm.c b/fs/sdcardfs/derived_perm.c index 85a60fb5ff39..e0b74dc77c5f 100644 --- a/fs/sdcardfs/derived_perm.c +++ b/fs/sdcardfs/derived_perm.c @@ -176,6 +176,9 @@ void fixup_lower_ownership(struct dentry *dentry, const char *name) gid_t gid = sbi->options.fs_low_gid; struct iattr newattrs; + if (!sbi->options.gid_derivation) + return; + info = SDCARDFS_I(d_inode(dentry)); info_d = info->data; perm = info_d->perm; diff --git a/fs/sdcardfs/inode.c b/fs/sdcardfs/inode.c index 103dc45a131f..85eb8ebb5372 100644 --- a/fs/sdcardfs/inode.c +++ b/fs/sdcardfs/inode.c @@ -34,10 +34,14 @@ const struct cred *override_fsids(struct sdcardfs_sb_info *sbi, if (!cred) return NULL; - if (data->under_obb) - uid = AID_MEDIA_OBB; - else - uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); + if (sbi->options.gid_derivation) { + if (data->under_obb) + uid = AID_MEDIA_OBB; + else + uid = multiuser_get_uid(data->userid, sbi->options.fs_low_uid); + } else { + uid = sbi->options.fs_low_uid; + } cred->fsuid = make_kuid(&init_user_ns, uid); cred->fsgid = make_kgid(&init_user_ns, sbi->options.fs_low_gid); diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 80825b287836..1e73b3ace563 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -32,6 +32,7 @@ enum { Opt_multiuser, Opt_userid, Opt_reserved_mb, + Opt_gid_derivation, Opt_err, }; @@ -43,6 +44,7 @@ static const match_table_t sdcardfs_tokens = { {Opt_mask, "mask=%u"}, {Opt_userid, "userid=%d"}, {Opt_multiuser, "multiuser"}, + {Opt_gid_derivation, "derive_gid"}, {Opt_reserved_mb, "reserved_mb=%u"}, {Opt_err, NULL} }; @@ -64,6 +66,8 @@ static int parse_options(struct super_block *sb, char *options, int silent, vfsopts->gid = 0; /* by default, 0MB is reserved */ opts->reserved_mb = 0; + /* by default, gid derivation is off */ + opts->gid_derivation = false; *debug = 0; @@ -115,6 +119,8 @@ static int parse_options(struct super_block *sb, char *options, int silent, return 0; opts->reserved_mb = option; break; + case Opt_gid_derivation: + opts->gid_derivation = true; /* unknown option */ default: if (!silent) diff --git a/fs/sdcardfs/sdcardfs.h b/fs/sdcardfs/sdcardfs.h index b8624fd8b817..88b92b2f1872 100644 --- a/fs/sdcardfs/sdcardfs.h +++ b/fs/sdcardfs/sdcardfs.h @@ -219,6 +219,7 @@ struct sdcardfs_mount_options { gid_t fs_low_gid; userid_t fs_user_id; bool multiuser; + bool gid_derivation; unsigned int reserved_mb; }; diff --git a/fs/sdcardfs/super.c b/fs/sdcardfs/super.c index 7f4539b4b249..b89947d878e3 100644 --- a/fs/sdcardfs/super.c +++ b/fs/sdcardfs/super.c @@ -302,6 +302,8 @@ static int sdcardfs_show_options(struct vfsmount *mnt, struct seq_file *m, seq_printf(m, ",mask=%u", vfsopts->mask); if (opts->fs_user_id) seq_printf(m, ",userid=%u", opts->fs_user_id); + if (opts->gid_derivation) + seq_puts(m, ",derive_gid"); if (opts->reserved_mb != 0) seq_printf(m, ",reserved=%uMB", opts->reserved_mb); From eebd4781fb3bb7b47568d8747d97d269d2649ec9 Mon Sep 17 00:00:00 2001 From: Daniel Rosenberg Date: Fri, 8 Sep 2017 17:20:06 -0700 Subject: [PATCH 185/546] ANDROID: sdcardfs: Add missing break Signed-off-by: Daniel Rosenberg Bug: 63245673 Change-Id: I5fc596420301045895e5a9a7e297fd05434babf9 --- fs/sdcardfs/main.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/sdcardfs/main.c b/fs/sdcardfs/main.c index 1e73b3ace563..0a2b5167e9a2 100644 --- a/fs/sdcardfs/main.c +++ b/fs/sdcardfs/main.c @@ -121,6 +121,7 @@ static int parse_options(struct super_block *sb, char *options, int silent, break; case Opt_gid_derivation: opts->gid_derivation = true; + break; /* unknown option */ default: if (!silent) From 6eb7ae1223f75fe19de8e75df80ac78ab6b7c39d Mon Sep 17 00:00:00 2001 From: Stefano Brivio Date: Fri, 18 Aug 2017 14:40:53 +0200 Subject: [PATCH 186/546] ipv6: accept 64k - 1 packet length in ip6_find_1stfragopt() [ Upstream commit 3de33e1ba0506723ab25734e098cf280ecc34756 ] A packet length of exactly IPV6_MAXPLEN is allowed, we should refuse parsing options only if the size is 64KiB or more. While at it, remove one extra variable and one assignment which were also introduced by the commit that introduced the size check. Checking the sum 'offset + len' and only later adding 'len' to 'offset' doesn't provide any advantage over directly summing to 'offset' and checking it. Fixes: 6399f1fae4ec ("ipv6: avoid overflow of offset in ip6_find_1stfragopt") Signed-off-by: Stefano Brivio Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/output_core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/net/ipv6/output_core.c b/net/ipv6/output_core.c index f9f02581c4ca..f99a04674419 100644 --- a/net/ipv6/output_core.c +++ b/net/ipv6/output_core.c @@ -86,7 +86,6 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) while (offset <= packet_len) { struct ipv6_opt_hdr *exthdr; - unsigned int len; switch (**nexthdr) { @@ -112,10 +111,9 @@ int ip6_find_1stfragopt(struct sk_buff *skb, u8 **nexthdr) exthdr = (struct ipv6_opt_hdr *)(skb_network_header(skb) + offset); - len = ipv6_optlen(exthdr); - if (len + offset >= IPV6_MAXPLEN) + offset += ipv6_optlen(exthdr); + if (offset > IPV6_MAXPLEN) return -EINVAL; - offset += len; *nexthdr = &exthdr->nexthdr; } From e51bf99be7cc95db97d9fa9031ab09ea037a5c7a Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Mon, 21 Aug 2017 09:47:10 -0700 Subject: [PATCH 187/546] ipv6: add rcu grace period before freeing fib6_node [ Upstream commit c5cff8561d2d0006e972bd114afd51f082fee77c ] We currently keep rt->rt6i_node pointing to the fib6_node for the route. And some functions make use of this pointer to dereference the fib6_node from rt structure, e.g. rt6_check(). However, as there is neither refcount nor rcu taken when dereferencing rt->rt6i_node, it could potentially cause crashes as rt->rt6i_node could be set to NULL by other CPUs when doing a route deletion. This patch introduces an rcu grace period before freeing fib6_node and makes sure the functions that dereference it takes rcu_read_lock(). Note: there is no "Fixes" tag because this bug was there in a very early stage. Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip6_fib.h | 30 +++++++++++++++++++++++++++++- net/ipv6/ip6_fib.c | 20 ++++++++++++++++---- net/ipv6/route.c | 14 +++++++++++--- 3 files changed, 56 insertions(+), 8 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index fb961a576abe..f57321248be2 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -68,6 +68,7 @@ struct fib6_node { __u16 fn_flags; int fn_sernum; struct rt6_info *rr_ptr; + struct rcu_head rcu; }; #ifndef CONFIG_IPV6_SUBTREES @@ -165,13 +166,40 @@ static inline void rt6_update_expires(struct rt6_info *rt0, int timeout) rt0->rt6i_flags |= RTF_EXPIRES; } +/* Function to safely get fn->sernum for passed in rt + * and store result in passed in cookie. + * Return true if we can get cookie safely + * Return false if not + */ +static inline bool rt6_get_cookie_safe(const struct rt6_info *rt, + u32 *cookie) +{ + struct fib6_node *fn; + bool status = false; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + + if (fn) { + *cookie = fn->fn_sernum; + status = true; + } + + rcu_read_unlock(); + return status; +} + static inline u32 rt6_get_cookie(const struct rt6_info *rt) { + u32 cookie = 0; + if (rt->rt6i_flags & RTF_PCPU || (unlikely(rt->dst.flags & DST_NOCACHE) && rt->dst.from)) rt = (struct rt6_info *)(rt->dst.from); - return rt->rt6i_node ? rt->rt6i_node->fn_sernum : 0; + rt6_get_cookie_safe(rt, &cookie); + + return cookie; } static inline void ip6_rt_put(struct rt6_info *rt) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index aad8cdf15472..e766b5e3c61c 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -150,11 +150,23 @@ static struct fib6_node *node_alloc(void) return fn; } -static void node_free(struct fib6_node *fn) +static void node_free_immediate(struct fib6_node *fn) { kmem_cache_free(fib6_node_kmem, fn); } +static void node_free_rcu(struct rcu_head *head) +{ + struct fib6_node *fn = container_of(head, struct fib6_node, rcu); + + kmem_cache_free(fib6_node_kmem, fn); +} + +static void node_free(struct fib6_node *fn) +{ + call_rcu(&fn->rcu, node_free_rcu); +} + static void rt6_rcu_free(struct rt6_info *rt) { call_rcu(&rt->dst.rcu_head, dst_rcu_free); @@ -588,9 +600,9 @@ static struct fib6_node *fib6_add_1(struct fib6_node *root, if (!in || !ln) { if (in) - node_free(in); + node_free_immediate(in); if (ln) - node_free(ln); + node_free_immediate(ln); return ERR_PTR(-ENOMEM); } @@ -1015,7 +1027,7 @@ int fib6_add(struct fib6_node *root, struct rt6_info *rt, root, and then (in failure) stale node in main tree. */ - node_free(sfn); + node_free_immediate(sfn); err = PTR_ERR(sn); goto failure; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index ef335070e98a..0e91fbf67200 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1248,7 +1248,9 @@ static void rt6_dst_from_metrics_check(struct rt6_info *rt) static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie) { - if (!rt->rt6i_node || (rt->rt6i_node->fn_sernum != cookie)) + u32 rt_cookie; + + if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie) return NULL; if (rt6_check_expired(rt)) @@ -1316,8 +1318,14 @@ static void ip6_link_failure(struct sk_buff *skb) if (rt->rt6i_flags & RTF_CACHE) { dst_hold(&rt->dst); ip6_del_rt(rt); - } else if (rt->rt6i_node && (rt->rt6i_flags & RTF_DEFAULT)) { - rt->rt6i_node->fn_sernum = -1; + } else { + struct fib6_node *fn; + + rcu_read_lock(); + fn = rcu_dereference(rt->rt6i_node); + if (fn && (rt->rt6i_flags & RTF_DEFAULT)) + fn->fn_sernum = -1; + rcu_read_unlock(); } } } From 354d36b746c3fdde7397409ce79ca89a2da2fbce Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Fri, 25 Aug 2017 15:03:10 -0700 Subject: [PATCH 188/546] ipv6: fix sparse warning on rt6i_node [ Upstream commit 4e587ea71bf924f7dac621f1351653bd41e446cb ] Commit c5cff8561d2d adds rcu grace period before freeing fib6_node. This generates a new sparse warning on rt->rt6i_node related code: net/ipv6/route.c:1394:30: error: incompatible types in comparison expression (different address spaces) ./include/net/ip6_fib.h:187:14: error: incompatible types in comparison expression (different address spaces) This commit adds "__rcu" tag for rt6i_node and makes sure corresponding rcu API is used for it. After this fix, sparse no longer generates the above warning. Fixes: c5cff8561d2d ("ipv6: add rcu grace period before freeing fib6_node") Signed-off-by: Wei Wang Acked-by: Eric Dumazet Acked-by: Martin KaFai Lau Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/ip6_fib.h | 2 +- net/ipv6/addrconf.c | 2 +- net/ipv6/ip6_fib.c | 11 +++++++---- net/ipv6/route.c | 3 ++- 4 files changed, 11 insertions(+), 7 deletions(-) diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h index f57321248be2..fa5e703a14ed 100644 --- a/include/net/ip6_fib.h +++ b/include/net/ip6_fib.h @@ -103,7 +103,7 @@ struct rt6_info { * the same cache line. */ struct fib6_table *rt6i_table; - struct fib6_node *rt6i_node; + struct fib6_node __rcu *rt6i_node; struct in6_addr rt6i_gateway; diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c index 735b22b1b4ea..92174881844d 100644 --- a/net/ipv6/addrconf.c +++ b/net/ipv6/addrconf.c @@ -5152,7 +5152,7 @@ static void __ipv6_ifa_notify(int event, struct inet6_ifaddr *ifp) * our DAD process, so we don't need * to do it again */ - if (!(ifp->rt->rt6i_node)) + if (!rcu_access_pointer(ifp->rt->rt6i_node)) ip6_ins_rt(ifp->rt); if (ifp->idev->cnf.forwarding) addrconf_join_anycast(ifp); diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e766b5e3c61c..c14f6038a061 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -869,7 +869,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, rt->dst.rt6_next = iter; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); atomic_inc(&rt->rt6i_ref); inet6_rt_notify(RTM_NEWROUTE, rt, info, 0); info->nl_net->ipv6.rt6_stats->fib_rt_entries++; @@ -894,7 +894,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, return err; *ins = rt; - rt->rt6i_node = fn; + rcu_assign_pointer(rt->rt6i_node, fn); rt->dst.rt6_next = iter->dst.rt6_next; atomic_inc(&rt->rt6i_ref); inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE); @@ -1454,8 +1454,9 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp, int fib6_del(struct rt6_info *rt, struct nl_info *info) { + struct fib6_node *fn = rcu_dereference_protected(rt->rt6i_node, + lockdep_is_held(&rt->rt6i_table->tb6_lock)); struct net *net = info->nl_net; - struct fib6_node *fn = rt->rt6i_node; struct rt6_info **rtp; #if RT6_DEBUG >= 2 @@ -1644,7 +1645,9 @@ static int fib6_clean_node(struct fib6_walker *w) if (res) { #if RT6_DEBUG >= 2 pr_debug("%s: del failed: rt=%p@%p err=%d\n", - __func__, rt, rt->rt6i_node, res); + __func__, rt, + rcu_access_pointer(rt->rt6i_node), + res); #endif continue; } diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 0e91fbf67200..48917437550e 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -1342,7 +1342,8 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu) static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt) { return !(rt->rt6i_flags & RTF_CACHE) && - (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node); + (rt->rt6i_flags & RTF_PCPU || + rcu_access_pointer(rt->rt6i_node)); } static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk, From 6d8c8fd1c4c71fac291e7243becf2de11c82c216 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 23 Aug 2017 15:59:49 +0200 Subject: [PATCH 189/546] qlge: avoid memcpy buffer overflow [ Upstream commit e58f95831e7468d25eb6e41f234842ecfe6f014f ] gcc-8.0.0 (snapshot) points out that we copy a variable-length string into a fixed length field using memcpy() with the destination length, and that ends up copying whatever follows the string: inlined from 'ql_core_dump' at drivers/net/ethernet/qlogic/qlge/qlge_dbg.c:1106:2: drivers/net/ethernet/qlogic/qlge/qlge_dbg.c:708:2: error: 'memcpy' reading 15 bytes from a region of size 14 [-Werror=stringop-overflow=] memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); Changing it to use strncpy() will instead zero-pad the destination, which seems to be the right thing to do here. The bug is probably harmless, but it seems like a good idea to address it in stable kernels as well, if only for the purpose of building with gcc-8 without warnings. Fixes: a61f80261306 ("qlge: Add ethtool register dump function.") Signed-off-by: Arnd Bergmann Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/qlogic/qlge/qlge_dbg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c index 829be21f97b2..be258d90de9e 100644 --- a/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c +++ b/drivers/net/ethernet/qlogic/qlge/qlge_dbg.c @@ -724,7 +724,7 @@ static void ql_build_coredump_seg_header( seg_hdr->cookie = MPI_COREDUMP_COOKIE; seg_hdr->segNum = seg_number; seg_hdr->segSize = seg_size; - memcpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); + strncpy(seg_hdr->description, desc, (sizeof(seg_hdr->description)) - 1); } /* From 081be8c9efd6003e1aa78679b3265732de4cec9b Mon Sep 17 00:00:00 2001 From: Florian Fainelli Date: Wed, 30 Aug 2017 17:49:29 -0700 Subject: [PATCH 190/546] Revert "net: phy: Correctly process PHY_HALTED in phy_stop_machine()" [ Upstream commit ebc8254aeae34226d0bc8fda309fd9790d4dccfe ] This reverts commit 7ad813f208533cebfcc32d3d7474dc1677d1b09a ("net: phy: Correctly process PHY_HALTED in phy_stop_machine()") because it is creating the possibility for a NULL pointer dereference. David Daney provide the following call trace and diagram of events: When ndo_stop() is called we call: phy_disconnect() +---> phy_stop_interrupts() implies: phydev->irq = PHY_POLL; +---> phy_stop_machine() | +---> phy_state_machine() | +----> queue_delayed_work(): Work queued. +--->phy_detach() implies: phydev->attached_dev = NULL; Now at a later time the queued work does: phy_state_machine() +---->netif_carrier_off(phydev->attached_dev): Oh no! It is NULL: CPU 12 Unable to handle kernel paging request at virtual address 0000000000000048, epc == ffffffff80de37ec, ra == ffffffff80c7c Oops[#1]: CPU: 12 PID: 1502 Comm: kworker/12:1 Not tainted 4.9.43-Cavium-Octeon+ #1 Workqueue: events_power_efficient phy_state_machine task: 80000004021ed100 task.stack: 8000000409d70000 $ 0 : 0000000000000000 ffffffff84720060 0000000000000048 0000000000000004 $ 4 : 0000000000000000 0000000000000001 0000000000000004 0000000000000000 $ 8 : 0000000000000000 0000000000000000 00000000ffff98f3 0000000000000000 $12 : 8000000409d73fe0 0000000000009c00 ffffffff846547c8 000000000000af3b $16 : 80000004096bab68 80000004096babd0 0000000000000000 80000004096ba800 $20 : 0000000000000000 0000000000000000 ffffffff81090000 0000000000000008 $24 : 0000000000000061 ffffffff808637b0 $28 : 8000000409d70000 8000000409d73cf0 80000000271bd300 ffffffff80c7804c Hi : 000000000000002a Lo : 000000000000003f epc : ffffffff80de37ec netif_carrier_off+0xc/0x58 ra : ffffffff80c7804c phy_state_machine+0x48c/0x4f8 Status: 14009ce3 KX SX UX KERNEL EXL IE Cause : 00800008 (ExcCode 02) BadVA : 0000000000000048 PrId : 000d9501 (Cavium Octeon III) Modules linked in: Process kworker/12:1 (pid: 1502, threadinfo=8000000409d70000, task=80000004021ed100, tls=0000000000000000) Stack : 8000000409a54000 80000004096bab68 80000000271bd300 80000000271c1e00 0000000000000000 ffffffff808a1708 8000000409a54000 80000000271bd300 80000000271bd320 8000000409a54030 ffffffff80ff0f00 0000000000000001 ffffffff81090000 ffffffff808a1ac0 8000000402182080 ffffffff84650000 8000000402182080 ffffffff84650000 ffffffff80ff0000 8000000409a54000 ffffffff808a1970 0000000000000000 80000004099e8000 8000000402099240 0000000000000000 ffffffff808a8598 0000000000000000 8000000408eeeb00 8000000409a54000 00000000810a1d00 0000000000000000 8000000409d73de8 8000000409d73de8 0000000000000088 000000000c009c00 8000000409d73e08 8000000409d73e08 8000000402182080 ffffffff808a84d0 8000000402182080 ... Call Trace: [] netif_carrier_off+0xc/0x58 [] phy_state_machine+0x48c/0x4f8 [] process_one_work+0x158/0x368 [] worker_thread+0x150/0x4c0 [] kthread+0xc8/0xe0 [] ret_from_kernel_thread+0x14/0x1c The original motivation for this change originated from Marc Gonzales indicating that his network driver did not have its adjust_link callback executing with phydev->link = 0 while he was expecting it. PHYLIB has never made any such guarantees ever because phy_stop() merely just tells the workqueue to move into PHY_HALTED state which will happen asynchronously. Reported-by: Geert Uytterhoeven Reported-by: David Daney Fixes: 7ad813f20853 ("net: phy: Correctly process PHY_HALTED in phy_stop_machine()") Signed-off-by: Florian Fainelli Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/phy/phy.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/net/phy/phy.c b/drivers/net/phy/phy.c index 49d9f0a789fe..7d0690433ee0 100644 --- a/drivers/net/phy/phy.c +++ b/drivers/net/phy/phy.c @@ -541,9 +541,6 @@ void phy_stop_machine(struct phy_device *phydev) if (phydev->state > PHY_UP && phydev->state != PHY_HALTED) phydev->state = PHY_UP; mutex_unlock(&phydev->lock); - - /* Now we can run the state machine synchronously */ - phy_state_machine(&phydev->state_queue.work); } /** From 611a98c8eca3098173309642df187056c17e0f65 Mon Sep 17 00:00:00 2001 From: Wei Wang Date: Thu, 18 May 2017 11:22:33 -0700 Subject: [PATCH 191/546] tcp: initialize rcv_mss to TCP_MIN_MSS instead of 0 [ Upstream commit 499350a5a6e7512d9ed369ed63a4244b6536f4f8 ] When tcp_disconnect() is called, inet_csk_delack_init() sets icsk->icsk_ack.rcv_mss to 0. This could potentially cause tcp_recvmsg() => tcp_cleanup_rbuf() => __tcp_select_window() call path to have division by 0 issue. So this patch initializes rcv_mss to TCP_MIN_MSS instead of 0. Reported-by: Andrey Konovalov Signed-off-by: Wei Wang Signed-off-by: Eric Dumazet Signed-off-by: Neal Cardwell Signed-off-by: Yuchung Cheng Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv4/tcp.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c index 0870a86e9d96..5597120c8ffd 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c @@ -2260,6 +2260,10 @@ int tcp_disconnect(struct sock *sk, int flags) tcp_set_ca_state(sk, TCP_CA_Open); tcp_clear_retrans(tp); inet_csk_delack_init(sk); + /* Initialize rcv_mss to TCP_MIN_MSS to avoid division by 0 + * issue in __tcp_select_window() + */ + icsk->icsk_ack.rcv_mss = TCP_MIN_MSS; tcp_init_send_head(sk); memset(&tp->rx_opt, 0, sizeof(tp->rx_opt)); __sk_dst_reset(sk); From 40bc5355e134af1d0ac05fe0dcb0aa55f9144bb4 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 1 Sep 2017 11:26:08 +0200 Subject: [PATCH 192/546] Revert "net: use lib/percpu_counter API for fragmentation mem accounting" [ Upstream commit fb452a1aa3fd4034d7999e309c5466ff2d7005aa ] This reverts commit 6d7b857d541ecd1d9bd997c97242d4ef94b19de2. There is a bug in fragmentation codes use of the percpu_counter API, that can cause issues on systems with many CPUs. The frag_mem_limit() just reads the global counter (fbc->count), without considering other CPUs can have upto batch size (130K) that haven't been subtracted yet. Due to the 3MBytes lower thresh limit, this become dangerous at >=24 CPUs (3*1024*1024/130000=24). The correct API usage would be to use __percpu_counter_compare() which does the right thing, and takes into account the number of (online) CPUs and batch size, to account for this and call __percpu_counter_sum() when needed. We choose to revert the use of the lib/percpu_counter API for frag memory accounting for several reasons: 1) On systems with CPUs > 24, the heavier fully locked __percpu_counter_sum() is always invoked, which will be more expensive than the atomic_t that is reverted to. Given systems with more than 24 CPUs are becoming common this doesn't seem like a good option. To mitigate this, the batch size could be decreased and thresh be increased. 2) The add_frag_mem_limit+sub_frag_mem_limit pairs happen on the RX CPU, before SKBs are pushed into sockets on remote CPUs. Given NICs can only hash on L2 part of the IP-header, the NIC-RXq's will likely be limited. Thus, a fair chance that atomic add+dec happen on the same CPU. Revert note that commit 1d6119baf061 ("net: fix percpu memory leaks") removed init_frag_mem_limit() and instead use inet_frags_init_net(). After this revert, inet_frags_uninit_net() becomes empty. Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") Fixes: 1d6119baf061 ("net: fix percpu memory leaks") Signed-off-by: Jesper Dangaard Brouer Acked-by: Florian Westphal Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/inet_frag.h | 36 +++++++++--------------------------- net/ipv4/inet_fragment.c | 4 +--- 2 files changed, 10 insertions(+), 30 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index ac42bbb37b2d..0cc1e287e683 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -1,14 +1,9 @@ #ifndef __NET_FRAG_H__ #define __NET_FRAG_H__ -#include - struct netns_frags { - /* The percpu_counter "mem" need to be cacheline aligned. - * mem.count must not share cacheline with other writers - */ - struct percpu_counter mem ____cacheline_aligned_in_smp; - + /* Keep atomic mem on separate cachelines in structs that include it */ + atomic_t mem ____cacheline_aligned_in_smp; /* sysctls */ int timeout; int high_thresh; @@ -110,11 +105,11 @@ void inet_frags_fini(struct inet_frags *); static inline int inet_frags_init_net(struct netns_frags *nf) { - return percpu_counter_init(&nf->mem, 0, GFP_KERNEL); + atomic_set(&nf->mem, 0); + return 0; } static inline void inet_frags_uninit_net(struct netns_frags *nf) { - percpu_counter_destroy(&nf->mem); } void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); @@ -140,37 +135,24 @@ static inline bool inet_frag_evicting(struct inet_frag_queue *q) /* Memory Tracking Functions. */ -/* The default percpu_counter batch size is not big enough to scale to - * fragmentation mem acct sizes. - * The mem size of a 64K fragment is approx: - * (44 fragments * 2944 truesize) + frag_queue struct(200) = 129736 bytes - */ -static unsigned int frag_percpu_counter_batch = 130000; - static inline int frag_mem_limit(struct netns_frags *nf) { - return percpu_counter_read(&nf->mem); + return atomic_read(&nf->mem); } static inline void sub_frag_mem_limit(struct netns_frags *nf, int i) { - __percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch); + atomic_sub(i, &nf->mem); } static inline void add_frag_mem_limit(struct netns_frags *nf, int i) { - __percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch); + atomic_add(i, &nf->mem); } -static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf) +static inline int sum_frag_mem_limit(struct netns_frags *nf) { - unsigned int res; - - local_bh_disable(); - res = percpu_counter_sum_positive(&nf->mem); - local_bh_enable(); - - return res; + return atomic_read(&nf->mem); } /* RFC 3168 support : diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c index fe144dae7372..c5fb2f694ed0 100644 --- a/net/ipv4/inet_fragment.c +++ b/net/ipv4/inet_fragment.c @@ -234,10 +234,8 @@ void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f) cond_resched(); if (read_seqretry(&f->rnd_seqlock, seq) || - percpu_counter_sum(&nf->mem)) + sum_frag_mem_limit(nf)) goto evict_again; - - percpu_counter_destroy(&nf->mem); } EXPORT_SYMBOL(inet_frags_exit_net); From 5f529e0d78447e03a3acf125883a3f7826817c01 Mon Sep 17 00:00:00 2001 From: Jesper Dangaard Brouer Date: Fri, 1 Sep 2017 11:26:13 +0200 Subject: [PATCH 193/546] Revert "net: fix percpu memory leaks" [ Upstream commit 5a63643e583b6a9789d7a225ae076fb4e603991c ] This reverts commit 1d6119baf0610f813eb9d9580eb4fd16de5b4ceb. After reverting commit 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") then here is no need for this fix-up patch. As percpu_counter is no longer used, it cannot memory leak it any-longer. Fixes: 6d7b857d541e ("net: use lib/percpu_counter API for fragmentation mem accounting") Fixes: 1d6119baf061 ("net: fix percpu memory leaks") Signed-off-by: Jesper Dangaard Brouer Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- include/net/inet_frag.h | 7 +------ net/ieee802154/6lowpan/reassembly.c | 11 +++-------- net/ipv4/ip_fragment.c | 12 +++--------- net/ipv6/netfilter/nf_conntrack_reasm.c | 12 +++--------- net/ipv6/reassembly.c | 12 +++--------- 5 files changed, 13 insertions(+), 41 deletions(-) diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h index 0cc1e287e683..c26a6e4dc306 100644 --- a/include/net/inet_frag.h +++ b/include/net/inet_frag.h @@ -103,15 +103,10 @@ struct inet_frags { int inet_frags_init(struct inet_frags *); void inet_frags_fini(struct inet_frags *); -static inline int inet_frags_init_net(struct netns_frags *nf) +static inline void inet_frags_init_net(struct netns_frags *nf) { atomic_set(&nf->mem, 0); - return 0; } -static inline void inet_frags_uninit_net(struct netns_frags *nf) -{ -} - void inet_frags_exit_net(struct netns_frags *nf, struct inet_frags *f); void inet_frag_kill(struct inet_frag_queue *q, struct inet_frags *f); diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c index 6b437e8760d3..12e8cf4bda9f 100644 --- a/net/ieee802154/6lowpan/reassembly.c +++ b/net/ieee802154/6lowpan/reassembly.c @@ -580,19 +580,14 @@ static int __net_init lowpan_frags_init_net(struct net *net) { struct netns_ieee802154_lowpan *ieee802154_lowpan = net_ieee802154_lowpan(net); - int res; ieee802154_lowpan->frags.high_thresh = IPV6_FRAG_HIGH_THRESH; ieee802154_lowpan->frags.low_thresh = IPV6_FRAG_LOW_THRESH; ieee802154_lowpan->frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&ieee802154_lowpan->frags); - if (res) - return res; - res = lowpan_frags_ns_sysctl_register(net); - if (res) - inet_frags_uninit_net(&ieee802154_lowpan->frags); - return res; + inet_frags_init_net(&ieee802154_lowpan->frags); + + return lowpan_frags_ns_sysctl_register(net); } static void __net_exit lowpan_frags_exit_net(struct net *net) diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c index b8a0607dab96..e2e162432aa3 100644 --- a/net/ipv4/ip_fragment.c +++ b/net/ipv4/ip_fragment.c @@ -840,8 +840,6 @@ static void __init ip4_frags_ctl_register(void) static int __net_init ipv4_frags_init_net(struct net *net) { - int res; - /* Fragment cache limits. * * The fragment memory accounting code, (tries to) account for @@ -865,13 +863,9 @@ static int __net_init ipv4_frags_init_net(struct net *net) */ net->ipv4.frags.timeout = IP_FRAG_TIME; - res = inet_frags_init_net(&net->ipv4.frags); - if (res) - return res; - res = ip4_frags_ns_ctl_register(net); - if (res) - inet_frags_uninit_net(&net->ipv4.frags); - return res; + inet_frags_init_net(&net->ipv4.frags); + + return ip4_frags_ns_ctl_register(net); } static void __net_exit ipv4_frags_exit_net(struct net *net) diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c index bab4441ed4e4..eb2dc39f7066 100644 --- a/net/ipv6/netfilter/nf_conntrack_reasm.c +++ b/net/ipv6/netfilter/nf_conntrack_reasm.c @@ -649,18 +649,12 @@ EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig); static int nf_ct_net_init(struct net *net) { - int res; - net->nf_frag.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->nf_frag.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->nf_frag.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->nf_frag.frags); - if (res) - return res; - res = nf_ct_frag6_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->nf_frag.frags); - return res; + inet_frags_init_net(&net->nf_frag.frags); + + return nf_ct_frag6_sysctl_register(net); } static void nf_ct_net_exit(struct net *net) diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c index a234552a7e3d..58f2139ebb5e 100644 --- a/net/ipv6/reassembly.c +++ b/net/ipv6/reassembly.c @@ -708,19 +708,13 @@ static void ip6_frags_sysctl_unregister(void) static int __net_init ipv6_frags_init_net(struct net *net) { - int res; - net->ipv6.frags.high_thresh = IPV6_FRAG_HIGH_THRESH; net->ipv6.frags.low_thresh = IPV6_FRAG_LOW_THRESH; net->ipv6.frags.timeout = IPV6_FRAG_TIMEOUT; - res = inet_frags_init_net(&net->ipv6.frags); - if (res) - return res; - res = ip6_frags_ns_sysctl_register(net); - if (res) - inet_frags_uninit_net(&net->ipv6.frags); - return res; + inet_frags_init_net(&net->ipv6.frags); + + return ip6_frags_ns_sysctl_register(net); } static void __net_exit ipv6_frags_exit_net(struct net *net) From 9b5e5d8a0045ca7c5dd195cba803eac6de6f589f Mon Sep 17 00:00:00 2001 From: Claudiu Manoil Date: Mon, 4 Sep 2017 10:45:28 +0300 Subject: [PATCH 194/546] gianfar: Fix Tx flow control deactivation [ Upstream commit 5d621672bc1a1e5090c1ac5432a18c79e0e13e03 ] The wrong register is checked for the Tx flow control bit, it should have been maccfg1 not maccfg2. This went unnoticed for so long probably because the impact is hardly visible, not to mention the tangled code from adjust_link(). First, link flow control (i.e. handling of Rx/Tx link level pause frames) is disabled by default (needs to be enabled via 'ethtool -A'). Secondly, maccfg2 always returns 0 for tx_flow_oldval (except for a few old boards), which results in Tx flow control remaining always on once activated. Fixes: 45b679c9a3ccd9e34f28e6ec677b812a860eb8eb ("gianfar: Implement PAUSE frame generation support") Signed-off-by: Claudiu Manoil Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/freescale/gianfar.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/freescale/gianfar.c b/drivers/net/ethernet/freescale/gianfar.c index 4cd2a7d0124f..7923bfdc9b30 100644 --- a/drivers/net/ethernet/freescale/gianfar.c +++ b/drivers/net/ethernet/freescale/gianfar.c @@ -3676,7 +3676,7 @@ static noinline void gfar_update_link_state(struct gfar_private *priv) u32 tempval1 = gfar_read(®s->maccfg1); u32 tempval = gfar_read(®s->maccfg2); u32 ecntrl = gfar_read(®s->ecntrl); - u32 tx_flow_oldval = (tempval & MACCFG1_TX_FLOW); + u32 tx_flow_oldval = (tempval1 & MACCFG1_TX_FLOW); if (phydev->duplex != priv->oldduplex) { if (!(phydev->duplex)) From 70479eafe3d974c60a71718530a46f8ad3ce9c3f Mon Sep 17 00:00:00 2001 From: Sabrina Dubroca Date: Fri, 8 Sep 2017 10:26:19 +0200 Subject: [PATCH 195/546] ipv6: fix memory leak with multiple tables during netns destruction [ Upstream commit ba1cc08d9488c94cb8d94f545305688b72a2a300 ] fib6_net_exit only frees the main and local tables. If another table was created with fib6_alloc_table, we leak it when the netns is destroyed. Fix this in the same way ip_fib_net_exit cleans up tables, by walking through the whole hashtable of fib6_table's. We can get rid of the special cases for local and main, since they're also part of the hashtable. Reproducer: ip netns add x ip -net x -6 rule add from 6003:1::/64 table 100 ip netns del x Reported-by: Jianlin Shi Fixes: 58f09b78b730 ("[NETNS][IPV6] ip6_fib - make it per network namespace") Signed-off-by: Sabrina Dubroca Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c14f6038a061..e03043dc1d78 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -203,6 +203,12 @@ static void rt6_release(struct rt6_info *rt) } } +static void fib6_free_table(struct fib6_table *table) +{ + inetpeer_invalidate_tree(&table->tb6_peers); + kfree(table); +} + static void fib6_link_table(struct net *net, struct fib6_table *tb) { unsigned int h; @@ -1885,15 +1891,22 @@ static int __net_init fib6_net_init(struct net *net) static void fib6_net_exit(struct net *net) { + unsigned int i; + rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); -#ifdef CONFIG_IPV6_MULTIPLE_TABLES - inetpeer_invalidate_tree(&net->ipv6.fib6_local_tbl->tb6_peers); - kfree(net->ipv6.fib6_local_tbl); -#endif - inetpeer_invalidate_tree(&net->ipv6.fib6_main_tbl->tb6_peers); - kfree(net->ipv6.fib6_main_tbl); + for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + struct hlist_head *head = &net->ipv6.fib_table_hash[i]; + struct hlist_node *tmp; + struct fib6_table *tb; + + hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) { + hlist_del(&tb->tb6_hlist); + fib6_free_table(tb); + } + } + kfree(net->ipv6.fib_table_hash); kfree(net->ipv6.rt6_stats); } From be9994817ad5717f64e07c19e5ec2f6b29aad4d0 Mon Sep 17 00:00:00 2001 From: Eric Dumazet Date: Fri, 8 Sep 2017 15:48:47 -0700 Subject: [PATCH 196/546] ipv6: fix typo in fib6_net_exit() [ Upstream commit 32a805baf0fb70b6dbedefcd7249ac7f580f9e3b ] IPv6 FIB should use FIB6_TABLE_HASHSZ, not FIB_TABLE_HASHSZ. Fixes: ba1cc08d9488 ("ipv6: fix memory leak with multiple tables during netns destruction") Signed-off-by: Eric Dumazet Signed-off-by: David S. Miller Signed-off-by: Greg Kroah-Hartman --- net/ipv6/ip6_fib.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index e03043dc1d78..c23e02a7ccb0 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -1896,7 +1896,7 @@ static void fib6_net_exit(struct net *net) rt6_ifdown(net, NULL); del_timer_sync(&net->ipv6.ip6_fib_timer); - for (i = 0; i < FIB_TABLE_HASHSZ; i++) { + for (i = 0; i < FIB6_TABLE_HASHSZ; i++) { struct hlist_head *head = &net->ipv6.fib_table_hash[i]; struct hlist_node *tmp; struct fib6_table *tb; From 53e5f7b8d41bb4af0666fe1e7887c13754b10094 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Sat, 12 Aug 2017 21:33:23 -0700 Subject: [PATCH 197/546] f2fs: check hot_data for roll-forward recovery commit 125c9fb1ccb53eb2ea9380df40f3c743f3fb2fed upstream. We need to check HOT_DATA to truncate any previous data block when doing roll-forward recovery. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim Signed-off-by: Greg Kroah-Hartman --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index cbf74f47cce8..e32f349f341b 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -276,7 +276,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; /* Get the previous summary */ - for (i = CURSEG_WARM_DATA; i <= CURSEG_COLD_DATA; i++) { + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; From e21d66048d4db2206c12344af07a934fd68418e4 Mon Sep 17 00:00:00 2001 From: Andy Lutomirski Date: Tue, 1 Aug 2017 07:11:35 -0700 Subject: [PATCH 198/546] x86/fsgsbase/64: Report FSBASE and GSBASE correctly in core dumps commit 9584d98bed7a7a904d0702ad06bbcc94703cb5b4 upstream. In ELF_COPY_CORE_REGS, we're copying from the current task, so accessing thread.fsbase and thread.gsbase makes no sense. Just read the values from the CPU registers. In practice, the old code would have been correct most of the time simply because thread.fsbase and thread.gsbase usually matched the CPU registers. Signed-off-by: Andy Lutomirski Cc: Borislav Petkov Cc: Borislav Petkov Cc: Brian Gerst Cc: Chang Seok Cc: Denys Vlasenko Cc: H. Peter Anvin Cc: Josh Poimboeuf Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/elf.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h index bcd3d6199464..bb16a58cf7e4 100644 --- a/arch/x86/include/asm/elf.h +++ b/arch/x86/include/asm/elf.h @@ -204,6 +204,7 @@ void set_personality_ia32(bool); #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ do { \ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fs; \ - (pr_reg)[22] = current->thread.gs; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ From d5c59ee8482042a0c63fa033c043989d00582525 Mon Sep 17 00:00:00 2001 From: Song Liu Date: Thu, 24 Aug 2017 09:53:59 -0700 Subject: [PATCH 199/546] md/raid5: release/flush io in raid5_do_work() commit 9c72a18e46ebe0f09484cce8ebf847abdab58498 upstream. In raid5, there are scenarios where some ios are deferred to a later time, and some IO need a flush to complete. To make sure we make progress with these IOs, we need to call the following functions: flush_deferred_bios(conf); r5l_flush_stripe_to_raid(conf->log); Both of these functions are called in raid5d(), but missing in raid5_do_work(). As a result, these functions are not called when multi-threading (group_thread_cnt > 0) is enabled. This patch adds calls to these function to raid5_do_work(). Note for stable branches: r5l_flush_stripe_to_raid(conf->log) is need for 4.4+ flush_deferred_bios(conf) is only needed for 4.11+ Signed-off-by: Song Liu Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 8f60520c8392..5eac08ffc697 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -5822,6 +5822,8 @@ static void raid5_do_work(struct work_struct *work) spin_unlock_irq(&conf->device_lock); + r5l_flush_stripe_to_raid(conf->log); + async_tx_issue_pending_all(); blk_finish_plug(&plug); From 6ea627b20205fcf7e8191b28f9207c97a69bf58f Mon Sep 17 00:00:00 2001 From: Chuck Lever Date: Sat, 29 Oct 2016 18:19:03 -0400 Subject: [PATCH 200/546] nfsd: Fix general protection fault in release_lock_stateid() commit f46c445b79906a9da55c13e0a6f6b6a006b892fe upstream. When I push NFSv4.1 / RDMA hard, (xfstests generic/089, for example), I get this crash on the server: Oct 28 22:04:30 klimt kernel: general protection fault: 0000 [#1] SMP DEBUG_PAGEALLOC Oct 28 22:04:30 klimt kernel: Modules linked in: cts rpcsec_gss_krb5 iTCO_wdt iTCO_vendor_support sb_edac edac_core x86_pkg_temp_thermal intel_powerclamp coretemp kvm_intel kvm btrfs irqbypass crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel lrw gf128mul glue_helper ablk_helper cryptd xor pcspkr raid6_pq i2c_i801 i2c_smbus lpc_ich mfd_core sg mei_me mei ioatdma shpchp wmi ipmi_si ipmi_msghandler rpcrdma ib_ipoib rdma_ucm acpi_power_meter acpi_pad ib_ucm ib_uverbs ib_umad rdma_cm ib_cm iw_cm nfsd auth_rpcgss nfs_acl lockd grace sunrpc ip_tables xfs libcrc32c mlx4_ib mlx4_en ib_core sr_mod cdrom sd_mod ast drm_kms_helper syscopyarea sysfillrect sysimgblt fb_sys_fops ttm drm crc32c_intel igb ahci libahci ptp mlx4_core pps_core dca libata i2c_algo_bit i2c_core dm_mirror dm_region_hash dm_log dm_mod Oct 28 22:04:30 klimt kernel: CPU: 7 PID: 1558 Comm: nfsd Not tainted 4.9.0-rc2-00005-g82cd754 #8 Oct 28 22:04:30 klimt kernel: Hardware name: Supermicro Super Server/X10SRL-F, BIOS 1.0c 09/09/2015 Oct 28 22:04:30 klimt kernel: task: ffff880835c3a100 task.stack: ffff8808420d8000 Oct 28 22:04:30 klimt kernel: RIP: 0010:[] [] release_lock_stateid+0x1f/0x60 [nfsd] Oct 28 22:04:30 klimt kernel: RSP: 0018:ffff8808420dbce0 EFLAGS: 00010246 Oct 28 22:04:30 klimt kernel: RAX: ffff88084e6660f0 RBX: ffff88084e667020 RCX: 0000000000000000 Oct 28 22:04:30 klimt kernel: RDX: 0000000000000007 RSI: 0000000000000000 RDI: ffff88084e667020 Oct 28 22:04:30 klimt kernel: RBP: ffff8808420dbcf8 R08: 0000000000000001 R09: 0000000000000000 Oct 28 22:04:30 klimt kernel: R10: ffff880835c3a100 R11: ffff880835c3aca8 R12: 6b6b6b6b6b6b6b6b Oct 28 22:04:30 klimt kernel: R13: ffff88084e6670d8 R14: ffff880835f546f0 R15: ffff880835f1c548 Oct 28 22:04:30 klimt kernel: FS: 0000000000000000(0000) GS:ffff88087bdc0000(0000) knlGS:0000000000000000 Oct 28 22:04:30 klimt kernel: CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 Oct 28 22:04:30 klimt kernel: CR2: 00007ff020389000 CR3: 0000000001c06000 CR4: 00000000001406e0 Oct 28 22:04:30 klimt kernel: Stack: Oct 28 22:04:30 klimt kernel: ffff88084e667020 0000000000000000 ffff88084e6670d8 ffff8808420dbd20 Oct 28 22:04:30 klimt kernel: ffffffffa05ac80d ffff880835f54548 ffff88084e640008 ffff880835f545b0 Oct 28 22:04:30 klimt kernel: ffff8808420dbd70 ffffffffa059803d ffff880835f1c768 0000000000000870 Oct 28 22:04:30 klimt kernel: Call Trace: Oct 28 22:04:30 klimt kernel: [] nfsd4_free_stateid+0xfd/0x1b0 [nfsd] Oct 28 22:04:30 klimt kernel: [] nfsd4_proc_compound+0x40d/0x690 [nfsd] Oct 28 22:04:30 klimt kernel: [] nfsd_dispatch+0xd4/0x1d0 [nfsd] Oct 28 22:04:30 klimt kernel: [] svc_process_common+0x3d9/0x700 [sunrpc] Oct 28 22:04:30 klimt kernel: [] svc_process+0xf4/0x330 [sunrpc] Oct 28 22:04:30 klimt kernel: [] nfsd+0xfa/0x160 [nfsd] Oct 28 22:04:30 klimt kernel: [] ? nfsd_destroy+0x170/0x170 [nfsd] Oct 28 22:04:30 klimt kernel: [] kthread+0x10b/0x120 Oct 28 22:04:30 klimt kernel: [] ? kthread_stop+0x280/0x280 Oct 28 22:04:30 klimt kernel: [] ret_from_fork+0x2a/0x40 Oct 28 22:04:30 klimt kernel: Code: c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 55 48 89 e5 41 55 41 54 53 48 8b 87 b0 00 00 00 48 89 fb 4c 8b a0 98 00 00 00 <49> 8b 44 24 20 48 8d b8 80 03 00 00 e8 10 66 1a e1 48 89 df e8 Oct 28 22:04:30 klimt kernel: RIP [] release_lock_stateid+0x1f/0x60 [nfsd] Oct 28 22:04:30 klimt kernel: RSP Oct 28 22:04:30 klimt kernel: ---[ end trace cf5d0b371973e167 ]--- Jeff Layton says: > Hm...now that I look though, this is a little suspicious: > > struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); > > I wonder if it's possible for the openstateid to have already been > destroyed at this point. > > We might be better off doing something like this to get the client pointer: > > stp->st_stid.sc_client; > > ...which should be more direct and less dependent on other stateids > staying valid. With the suggested change, I am no longer able to reproduce the above oops. v2: Fix unhash_lock_stateid() as well Fix-suggested-by: Jeff Layton Fixes: 42691398be08 ('nfsd: Fix race between FREE_STATEID and LOCK') Signed-off-by: Chuck Lever Reviewed-by: Jeff Layton Cc: Christian Theune Signed-off-by: J. Bruce Fields Signed-off-by: Greg Kroah-Hartman --- fs/nfsd/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index c7f1ce41442a..9e5a6842346e 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1145,9 +1145,7 @@ static void put_ol_stateid_locked(struct nfs4_ol_stateid *stp, static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) { - struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); - - lockdep_assert_held(&oo->oo_owner.so_client->cl_lock); + lockdep_assert_held(&stp->st_stid.sc_client->cl_lock); list_del_init(&stp->st_locks); nfs4_unhash_stid(&stp->st_stid); @@ -1156,12 +1154,12 @@ static bool unhash_lock_stateid(struct nfs4_ol_stateid *stp) static void release_lock_stateid(struct nfs4_ol_stateid *stp) { - struct nfs4_openowner *oo = openowner(stp->st_openstp->st_stateowner); + struct nfs4_client *clp = stp->st_stid.sc_client; bool unhashed; - spin_lock(&oo->oo_owner.so_client->cl_lock); + spin_lock(&clp->cl_lock); unhashed = unhash_lock_stateid(stp); - spin_unlock(&oo->oo_owner.so_client->cl_lock); + spin_unlock(&clp->cl_lock); if (unhashed) nfs4_put_stid(&stp->st_stid); } From c576160ff3f31c1dd9536188520a41fdf8b5fc95 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Mon, 12 Dec 2016 16:42:08 -0800 Subject: [PATCH 201/546] mm: prevent double decrease of nr_reserved_highatomic commit 4855e4a7f29d6d10b0b9c84e189c770c9a94e91e upstream. There is race between page freeing and unreserved highatomic. CPU 0 CPU 1 free_hot_cold_page mt = get_pfnblock_migratetype set_pcppage_migratetype(page, mt) unreserve_highatomic_pageblock spin_lock_irqsave(&zone->lock) move_freepages_block set_pageblock_migratetype(page) spin_unlock_irqrestore(&zone->lock) free_pcppages_bulk __free_one_page(mt) <- mt is stale By above race, a page on CPU 0 could go non-highorderatomic free list since the pageblock's type is changed. By that, unreserve logic of highorderatomic can decrease reserved count on a same pageblock severak times and then it will make mismatch between nr_reserved_highatomic and the number of reserved pageblock. So, this patch verifies whether the pageblock is highatomic or not and decrease the count only if the pageblock is highatomic. Link: http://lkml.kernel.org/r/1476259429-18279-3-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Acked-by: Vlastimil Babka Acked-by: Mel Gorman Cc: Joonsoo Kim Cc: Sangseok Lee Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Cc: Miles Chen Signed-off-by: Greg Kroah-Hartman --- mm/page_alloc.c | 24 ++++++++++++++++++------ 1 file changed, 18 insertions(+), 6 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 53286b2f5b1c..6b5421ae86c6 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1748,13 +1748,25 @@ static void unreserve_highatomic_pageblock(const struct alloc_context *ac) struct page, lru); /* - * It should never happen but changes to locking could - * inadvertently allow a per-cpu drain to add pages - * to MIGRATE_HIGHATOMIC while unreserving so be safe - * and watch for underflows. + * In page freeing path, migratetype change is racy so + * we can counter several free pages in a pageblock + * in this loop althoug we changed the pageblock type + * from highatomic to ac->migratetype. So we should + * adjust the count once. */ - zone->nr_reserved_highatomic -= min(pageblock_nr_pages, - zone->nr_reserved_highatomic); + if (get_pageblock_migratetype(page) == + MIGRATE_HIGHATOMIC) { + /* + * It should never happen but changes to + * locking could inadvertently allow a per-cpu + * drain to add pages to MIGRATE_HIGHATOMIC + * while unreserving so be safe and watch for + * underflows. + */ + zone->nr_reserved_highatomic -= min( + pageblock_nr_pages, + zone->nr_reserved_highatomic); + } /* * Convert to ac->migratetype and avoid the normal From e1e6620f042cd7a6b1846335c46ca7b9897bc823 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Jun 2017 23:10:41 +0200 Subject: [PATCH 202/546] tty: improve tty_insert_flip_char() fast path commit 979990c6284814617d8f2179d197f72ff62b5d85 upstream. kernelci.org reports a crazy stack usage for the VT code when CONFIG_KASAN is enabled: drivers/tty/vt/keyboard.c: In function 'kbd_keycode': drivers/tty/vt/keyboard.c:1452:1: error: the frame size of 2240 bytes is larger than 2048 bytes [-Werror=frame-larger-than=] The problem is that tty_insert_flip_char() gets inlined many times into kbd_keycode(), and also into other functions, and each copy requires 128 bytes for stack redzone to check for a possible out-of-bounds access on the 'ch' and 'flags' arguments that are passed into tty_insert_flip_string_flags as a variable-length string. This introduces a new __tty_insert_flip_char() function for the slow path, which receives the two arguments by value. This completely avoids the problem and the stack usage goes back down to around 100 bytes. Without KASAN, this is also slightly better, as we don't have to spill the arguments to the stack but can simply pass 'ch' and 'flag' in registers, saving a few bytes in .text for each call site. This should be backported to linux-4.0 or later, which first introduced the stack sanitizer in the kernel. Fixes: c420f167db8c ("kasan: enable stack instrumentation") Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 24 ++++++++++++++++++++++++ include/linux/tty_flip.h | 3 ++- 2 files changed, 26 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index fb31eecb708d..c478da54fd45 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -361,6 +361,30 @@ int tty_insert_flip_string_flags(struct tty_port *port, } EXPORT_SYMBOL(tty_insert_flip_string_flags); +/** + * __tty_insert_flip_char - Add one character to the tty buffer + * @port: tty port + * @ch: character + * @flag: flag byte + * + * Queue a single byte to the tty buffering, with an optional flag. + * This is the slow path of tty_insert_flip_char. + */ +int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) +{ + struct tty_buffer *tb = port->buf.tail; + int flags = (flag == TTY_NORMAL) ? TTYB_NORMAL : 0; + + if (!tty_buffer_request_room(port, 1)) + return 0; + + *flag_buf_ptr(tb, tb->used) = flag; + *char_buf_ptr(tb, tb->used++) = ch; + + return 1; +} +EXPORT_SYMBOL(__tty_insert_flip_char); + /** * tty_schedule_flip - push characters to ldisc * @port: tty port to push from diff --git a/include/linux/tty_flip.h b/include/linux/tty_flip.h index c28dd523f96e..d43837f2ce3a 100644 --- a/include/linux/tty_flip.h +++ b/include/linux/tty_flip.h @@ -12,6 +12,7 @@ extern int tty_prepare_flip_string(struct tty_port *port, unsigned char **chars, size_t size); extern void tty_flip_buffer_push(struct tty_port *port); void tty_schedule_flip(struct tty_port *port); +int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag); static inline int tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) @@ -26,7 +27,7 @@ static inline int tty_insert_flip_char(struct tty_port *port, *char_buf_ptr(tb, tb->used++) = ch; return 1; } - return tty_insert_flip_string_flags(port, &ch, &flag, 1); + return __tty_insert_flip_char(port, ch, flag); } static inline int tty_insert_flip_string(struct tty_port *port, From 077933dcd5cabd45e82aceab45dec772ebecbd09 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 20 Jun 2017 23:10:42 +0200 Subject: [PATCH 203/546] tty: improve tty_insert_flip_char() slow path commit 065ea0a7afd64d6cf3464bdd1d8cd227527e2045 upstream. While working on improving the fast path of tty_insert_flip_char(), I noticed that by calling tty_buffer_request_room(), we needlessly move to the separate flag buffer mode for the tty, even when all characters use TTY_NORMAL as the flag. This changes the code to call __tty_buffer_request_room() with the correct flag, which will then allocate a regular buffer when it rounds out of space but no special flags have been used. I'm guessing that this is the behavior that Peter Hurley intended when he introduced the compacted flip buffers. Fixes: acc0f67f307f ("tty: Halve flip buffer GFP_ATOMIC memory consumption") Cc: Peter Hurley Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index c478da54fd45..e4d0ef844d9d 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -375,10 +375,11 @@ int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) struct tty_buffer *tb = port->buf.tail; int flags = (flag == TTY_NORMAL) ? TTYB_NORMAL : 0; - if (!tty_buffer_request_room(port, 1)) + if (!__tty_buffer_request_room(port, 1, flags)) return 0; - *flag_buf_ptr(tb, tb->used) = flag; + if (~tb->flags & TTYB_NORMAL) + *flag_buf_ptr(tb, tb->used) = flag; *char_buf_ptr(tb, tb->used++) = ch; return 1; From c13c5c7e88d79cae57ac25c6a3946cb17418ae3f Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Aug 2017 13:11:39 +0200 Subject: [PATCH 204/546] tty: fix __tty_insert_flip_char regression commit 8a5a90a2a477b86a3dc2eaa5a706db9bfdd647ca upstream. Sergey noticed a small but fatal mistake in __tty_insert_flip_char, leading to an oops in an interrupt handler when using any serial port. The problem is that I accidentally took the tty_buffer pointer before calling __tty_buffer_request_room(), which replaces the buffer. This moves the pointer lookup to the right place after allocating the new buffer space. Fixes: 979990c62848 ("tty: improve tty_insert_flip_char() fast path") Reported-by: Sergey Senozhatsky Tested-by: Sergey Senozhatsky Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/tty/tty_buffer.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index e4d0ef844d9d..8f3566cde3eb 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -372,12 +372,13 @@ EXPORT_SYMBOL(tty_insert_flip_string_flags); */ int __tty_insert_flip_char(struct tty_port *port, unsigned char ch, char flag) { - struct tty_buffer *tb = port->buf.tail; + struct tty_buffer *tb; int flags = (flag == TTY_NORMAL) ? TTYB_NORMAL : 0; if (!__tty_buffer_request_room(port, 1, flags)) return 0; + tb = port->buf.tail; if (~tb->flags & TTYB_NORMAL) *flag_buf_ptr(tb, tb->used) = flag; *char_buf_ptr(tb, tb->used++) = ch; From bf592dde1262c7c3a65d879d09272902d3fa7c6b Mon Sep 17 00:00:00 2001 From: Kai-Heng Feng Date: Fri, 15 Sep 2017 09:36:16 -0700 Subject: [PATCH 205/546] Input: i8042 - add Gigabyte P57 to the keyboard reset table commit 697c5d8a36768b36729533fb44622b35d56d6ad0 upstream. Similar to other Gigabyte laptops, the touchpad on P57 requires a keyboard reset to detect Elantech touchpad correctly. BugLink: https://bugs.launchpad.net/bugs/1594214 Signed-off-by: Kai-Heng Feng Signed-off-by: Dmitry Torokhov Signed-off-by: Greg Kroah-Hartman --- drivers/input/serio/i8042-x86ia64io.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/input/serio/i8042-x86ia64io.h b/drivers/input/serio/i8042-x86ia64io.h index 5be14ad29d46..dbf09836ff30 100644 --- a/drivers/input/serio/i8042-x86ia64io.h +++ b/drivers/input/serio/i8042-x86ia64io.h @@ -904,6 +904,13 @@ static const struct dmi_system_id __initconst i8042_dmi_kbdreset_table[] = { DMI_MATCH(DMI_PRODUCT_NAME, "P34"), }, }, + { + /* Gigabyte P57 - Elantech touchpad */ + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "GIGABYTE"), + DMI_MATCH(DMI_PRODUCT_NAME, "P57"), + }, + }, { /* Schenker XMG C504 - Elantech touchpad */ .matches = { From b6c818d813c66ac86818f0548cf27e1001c0e0c0 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:48 +0200 Subject: [PATCH 206/546] MIPS: math-emu: .: Fix quiet NaN propagation commit e78bf0dc4789bdea1453595ae89e8db65918e22e upstream. Fix the value returned by . fd,fs,ft, if both inputs are quiet NaNs. The . specifications state that the returned value in such cases should be the quiet NaN contained in register fs. A relevant example: MAX.S fd,fs,ft: If fs contains qNaN1, and ft contains qNaN2, fd is going to contain qNaN1 (without this patch, it used to contain qNaN2). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16880/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 32 ++++++++++++++++++++++++++++---- arch/mips/math-emu/dp_fmin.c | 32 ++++++++++++++++++++++++++++---- arch/mips/math-emu/sp_fmax.c | 32 ++++++++++++++++++++++++++++---- arch/mips/math-emu/sp_fmin.c | 32 ++++++++++++++++++++++++++++---- 4 files changed, 112 insertions(+), 16 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index fd71b8daaaf2..41bd6ed852b9 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -47,14 +47,26 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index c1072b0dfb95..53fb8c904e32 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -47,14 +47,26 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index 4d000844e48e..d0d73c3226dc 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -47,14 +47,26 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 4eb1bb9e9dec..011692e326af 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -47,14 +47,26 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -147,14 +159,26 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): From 6acd1d26c32ea6e9d38f4839ba921cc3780bb205 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:49 +0200 Subject: [PATCH 207/546] MIPS: math-emu: .: Fix cases of both inputs zero commit 15560a58bfd4ff82cdd16b2270d4ef9b06d2cc4d upstream. Fix the value returned by ., if both inputs are zeros. The right behavior in such cases is stated in instruction reference manual and is as follows: fs ft MAX MIN MAXA MINA --------------------------------------------- 0 0 0 0 0 0 0 -0 0 -0 0 -0 -0 0 0 -0 0 -0 -0 -0 -0 -0 -0 -0 Prior to this patch, some of the above cases were yielding correct results. However, for the sake of code consistency, all such cases are rewritten in this patch. A relevant example: MAX.S fd,fs,ft: If fs contains +0.0, and ft contains -0.0, fd is going to contain +0.0 (without this patch, it used to contain -0.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16881/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 8 ++------ arch/mips/math-emu/dp_fmin.c | 8 ++------ arch/mips/math-emu/sp_fmax.c | 8 ++------ arch/mips/math-emu/sp_fmin.c | 8 ++------ 4 files changed, 8 insertions(+), 24 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 41bd6ed852b9..31f091a7819b 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -92,9 +92,7 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y) return ys ? x : y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -204,9 +202,7 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 53fb8c904e32..e607d55208ad 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -92,9 +92,7 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y) return ys ? y : x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -204,9 +202,7 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index d0d73c3226dc..3ca5b204e9d0 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -92,9 +92,7 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y) return ys ? x : y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -204,9 +202,7 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 011692e326af..c982647df39a 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -92,9 +92,7 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y) return ys ? y : x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -204,9 +202,7 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; From a83ffb581f2675250d0127d686a918446975b3e2 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:50 +0200 Subject: [PATCH 208/546] MIPS: math-emu: .: Fix cases of both inputs negative commit aabf5cf02e22ebc4e541adf835910f388b6c3e65 upstream. Fix the value returned by ., if both inputs are negative normal fp numbers. The previous logic did not take into account that if both inputs have the same sign, there should be separate treatment of the cases when both inputs are negative and when both inputs are positive. A relevant example: MAX.S fd,fs,ft: If fs contains -5.0, and ft contains -7.0, fd is going to contain -5.0 (without this patch, it used to contain -7.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16882/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 32 ++++++++++++++++++++++++-------- arch/mips/math-emu/dp_fmin.c | 32 ++++++++++++++++++++++++-------- arch/mips/math-emu/sp_fmax.c | 32 ++++++++++++++++++++++++-------- arch/mips/math-emu/sp_fmin.c | 32 ++++++++++++++++++++++++-------- 4 files changed, 96 insertions(+), 32 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 31f091a7819b..0b53c7861101 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -116,16 +116,32 @@ union ieee754dp ieee754dp_fmax(union ieee754dp x, union ieee754dp y) else if (xs < ys) return x; - /* Compare exponent */ - if (xe > ye) - return x; - else if (xe < ye) - return y; + /* Signs of inputs are equal, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } else { + /* Inputs are both negative */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return y; + return x; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return y; - return x; + return x; + return y; } union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index e607d55208ad..099e6bd55353 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -116,16 +116,32 @@ union ieee754dp ieee754dp_fmin(union ieee754dp x, union ieee754dp y) else if (xs < ys) return y; - /* Compare exponent */ - if (xe > ye) - return y; - else if (xe < ye) - return x; + /* Signs of inputs are the same, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } else { + /* Inputs are both negative */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return x; + return y; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return x; - return y; + return y; + return x; } union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index 3ca5b204e9d0..7efa7729bd85 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -116,16 +116,32 @@ union ieee754sp ieee754sp_fmax(union ieee754sp x, union ieee754sp y) else if (xs < ys) return x; - /* Compare exponent */ - if (xe > ye) - return x; - else if (xe < ye) - return y; + /* Signs of inputs are equal, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } else { + /* Inputs are both negative */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return y; + return x; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return y; - return x; + return x; + return y; } union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index c982647df39a..e2c554359f7b 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -116,16 +116,32 @@ union ieee754sp ieee754sp_fmin(union ieee754sp x, union ieee754sp y) else if (xs < ys) return y; - /* Compare exponent */ - if (xe > ye) - return y; - else if (xe < ye) - return x; + /* Signs of inputs are the same, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } else { + /* Inputs are both negative */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return x; + return y; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return x; - return y; + return y; + return x; } union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) From 322bf697bdc4ed16c9ec89d0253c3a01023e51f4 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:51 +0200 Subject: [PATCH 209/546] MIPS: math-emu: .: Fix cases of input values with opposite signs commit 1a41b3b441508ae63b1a9ec699ec94065739eb60 upstream. Fix the value returned by ., if the inputs are normal fp numbers of the same absolute value, but opposite signs. A relevant example: MAXA.S fd,fs,ft: If fs contains -3.0, and ft contains +3.0, fd is going to contain +3.0 (without this patch, it used to contain -3.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16883/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 8 ++++++-- arch/mips/math-emu/dp_fmin.c | 6 +++++- arch/mips/math-emu/sp_fmax.c | 8 ++++++-- arch/mips/math-emu/sp_fmin.c | 6 +++++- 4 files changed, 22 insertions(+), 6 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 0b53c7861101..81d12bfa6d5e 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -243,7 +243,11 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) return y; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) return y; - return x; + else if (xm > ym) + return x; + else if (xs == 0) + return x; + return y; } diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 099e6bd55353..4574f04b44ce 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -243,7 +243,11 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) return x; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) + return x; + else if (xm > ym) + return y; + else if (xs == 1) return x; return y; } diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index 7efa7729bd85..fb4149762101 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -243,7 +243,11 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) return y; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) return y; - return x; + else if (xm > ym) + return x; + else if (xs == 0) + return x; + return y; } diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index e2c554359f7b..7915b9430f68 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -243,7 +243,11 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) return x; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) + return x; + else if (xm > ym) + return y; + else if (xs == 1) return x; return y; } From f4d77fc754f2be1db6815cd8ccf26c9c0514a96f Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:52 +0200 Subject: [PATCH 210/546] MIPS: math-emu: .: Fix cases of both infinite inputs commit 3444c4eb534c20e44f0d6670b34263efaf8b531f upstream. Fix the value returned by . fd,fs,ft, if both inputs are infinite. The previous implementation returned always the value contained in ft in such cases. The correct behavior is specified in Mips instruction set manual and is as follows: fs ft MAXA MINA --------------------------------- inf inf inf inf inf -inf inf -inf -inf inf inf -inf -inf -inf -inf -inf A relevant example: MAXA.S fd,fs,ft: If fs contains +inf, and ft contains -inf, fd is going to contain +inf (without this patch, it used to contain -inf). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16884/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmax.c | 4 +++- arch/mips/math-emu/dp_fmin.c | 4 +++- arch/mips/math-emu/sp_fmax.c | 4 +++- arch/mips/math-emu/sp_fmin.c | 4 +++- 4 files changed, 12 insertions(+), 4 deletions(-) diff --git a/arch/mips/math-emu/dp_fmax.c b/arch/mips/math-emu/dp_fmax.c index 81d12bfa6d5e..5bec64f2884e 100644 --- a/arch/mips/math-emu/dp_fmax.c +++ b/arch/mips/math-emu/dp_fmax.c @@ -202,6 +202,9 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754dp_inf(xs & ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 4574f04b44ce..2495bd7333f5 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -202,6 +202,9 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754dp_inf(xs | ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): diff --git a/arch/mips/math-emu/sp_fmax.c b/arch/mips/math-emu/sp_fmax.c index fb4149762101..74a5a00d2f22 100644 --- a/arch/mips/math-emu/sp_fmax.c +++ b/arch/mips/math-emu/sp_fmax.c @@ -202,6 +202,9 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754sp_inf(xs & ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 7915b9430f68..42ec4315e66c 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -202,6 +202,9 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754sp_inf(xs | ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -209,7 +212,6 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): From 9354f4d0beb05f17a9c06315b8c00b25d6c97095 Mon Sep 17 00:00:00 2001 From: Aleksandar Markovic Date: Thu, 27 Jul 2017 18:08:53 +0200 Subject: [PATCH 211/546] MIPS: math-emu: MINA.: Fix some cases of infinity and zero inputs commit 304bfe473e70523e591fb1c9223289d355e0bdcb upstream. Fix following special cases for MINA>.: - if one of the inputs is zero, and the other is subnormal, normal, or infinity, the value of the former should be returned (that is, a zero). - if one of the inputs is infinity, and the other input is normal, or subnormal, the value of the latter should be returned. The previous implementation's logic for such cases was incorrect - it appears as if it implements MAXA, and not MINA instruction. A relevant example: MINA.S fd,fs,ft: If fs contains 100.0, and ft contains 0.0, fd is going to contain 0.0 (without this patch, it used to contain 100.0). Fixes: a79f5f9ba508 ("MIPS: math-emu: Add support for the MIPS R6 MAX{, A} FPU instruction") Fixes: 4e9561b20e2f ("MIPS: math-emu: Add support for the MIPS R6 MIN{, A} FPU instruction") Signed-off-by: Miodrag Dinic Signed-off-by: Goran Ferenc Signed-off-by: Aleksandar Markovic Reviewed-by: James Hogan Cc: Bo Hu Cc: Douglas Leung Cc: Jin Qian Cc: Paul Burton Cc: Petar Jovanovic Cc: Raghu Gandham Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/16885/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/dp_fmin.c | 4 ++-- arch/mips/math-emu/sp_fmin.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/mips/math-emu/dp_fmin.c b/arch/mips/math-emu/dp_fmin.c index 2495bd7333f5..a287b23818d8 100644 --- a/arch/mips/math-emu/dp_fmin.c +++ b/arch/mips/math-emu/dp_fmin.c @@ -210,14 +210,14 @@ union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): - return x; + return y; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM): - return y; + return x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): return ieee754dp_zero(xs | ys); diff --git a/arch/mips/math-emu/sp_fmin.c b/arch/mips/math-emu/sp_fmin.c index 42ec4315e66c..c51385f46b09 100644 --- a/arch/mips/math-emu/sp_fmin.c +++ b/arch/mips/math-emu/sp_fmin.c @@ -210,14 +210,14 @@ union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): - return x; + return y; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM): - return y; + return x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): return ieee754sp_zero(xs | ys); From 5e9d28b003b0312bc1c17994edb84bbb9a4a060a Mon Sep 17 00:00:00 2001 From: Stephan Mueller Date: Thu, 21 Sep 2017 10:16:53 +0200 Subject: [PATCH 212/546] crypto: AF_ALG - remove SGL terminator indicator when chaining Fixed differently upstream as commit 2d97591ef43d ("crypto: af_alg - consolidation of duplicate code") The SGL is MAX_SGL_ENTS + 1 in size. The last SG entry is used for the chaining and is properly updated with the sg_chain invocation. During the filling-in of the initial SG entries, sg_mark_end is called for each SG entry. This is appropriate as long as no additional SGL is chained with the current SGL. However, when a new SGL is chained and the last SG entry is updated with sg_chain, the last but one entry still contains the end marker from the sg_mark_end. This end marker must be removed as otherwise a walk of the chained SGLs will cause a NULL pointer dereference at the last but one SG entry, because sg_next will return NULL. The patch only applies to all kernels up to and including 4.13. The patch 2d97591ef43d0587be22ad1b0d758d6df4999a0b added to 4.14-rc1 introduced a complete new code base which addresses this bug in a different way. Yet, that patch is too invasive for stable kernels and was therefore not marked for stable. Fixes: 8ff590903d5fc ("crypto: algif_skcipher - User-space interface for skcipher operations") Signed-off-by: Stephan Mueller Acked-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- crypto/algif_skcipher.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index b3b0004ea8ac..d12782dc9683 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -143,8 +143,10 @@ static int skcipher_alloc_sgl(struct sock *sk) sg_init_table(sgl->sg, MAX_SGL_ENTS + 1); sgl->cur = 0; - if (sg) + if (sg) { sg_chain(sg, MAX_SGL_ENTS + 1, sgl->sg); + sg_unmark_end(sg + (MAX_SGL_ENTS - 1)); + } list_add_tail(&sgl->list, &ctx->tsgl); } From cd46241eb03ca90f58553ad15e484c4f86d6fd64 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 24 Aug 2017 15:19:39 -0400 Subject: [PATCH 213/546] ext4: fix incorrect quotaoff if the quota feature is enabled commit b0a5a9589decd07db755d6a8d9c0910d96ff7992 upstream. Current ext4 quota should always "usage enabled" if the quota feautre is enabled. But in ext4_orphan_cleanup(), it turn quotas off directly (used for the older journaled quota), so we cannot turn it on again via "quotaon" unless umount and remount ext4. Simple reproduce: mkfs.ext4 -O project,quota /dev/vdb1 mount -o prjquota /dev/vdb1 /mnt chattr -p 123 /mnt chattr +P /mnt touch /mnt/aa /mnt/bb exec 100<>/mnt/aa rm -f /mnt/aa sync echo c > /proc/sysrq-trigger #reboot and mount mount -o prjquota /dev/vdb1 /mnt #query status quotaon -Ppv /dev/vdb1 #output quotaon: Cannot find mountpoint for device /dev/vdb1 quotaon: No correct mountpoint specified. This patch add check for journaled quotas to avoid incorrect quotaoff when ext4 has quota feautre. Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 68345a9e59b8..079b4ed457cb 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2243,7 +2243,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on quotas so that they are updated correctly */ + /* Turn on journaled quotas so that they are updated correctly */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); @@ -2309,9 +2309,9 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn quotas off */ + /* Turn off journaled quotas if they were enabled for orphan cleanup */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (sb_dqopt(sb)->files[i]) + if (EXT4_SB(sb)->s_qf_names[i] && sb_dqopt(sb)->files[i]) dquot_quota_off(sb, i); } #endif From c53f01698f68a3d6880c27e24a4e98ca6cd9feb9 Mon Sep 17 00:00:00 2001 From: "zhangyi (F)" Date: Thu, 24 Aug 2017 15:21:50 -0400 Subject: [PATCH 214/546] ext4: fix quota inconsistency during orphan cleanup for read-only mounts commit 95f1fda47c9d8738f858c3861add7bf0a36a7c0b upstream. Quota does not get enabled for read-only mounts if filesystem has quota feature, so that quotas cannot updated during orphan cleanup, which will lead to quota inconsistency. This patch turn on quotas during orphan cleanup for this case, make sure quotas can be updated correctly. Reported-by: Jan Kara Signed-off-by: zhangyi (F) Signed-off-by: Theodore Ts'o Reviewed-by: Jan Kara Signed-off-by: Greg Kroah-Hartman --- fs/ext4/super.c | 38 +++++++++++++++++++++++++++++++------- 1 file changed, 31 insertions(+), 7 deletions(-) diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 079b4ed457cb..32941cd6d34b 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -2205,6 +2205,7 @@ static void ext4_orphan_cleanup(struct super_block *sb, unsigned int s_flags = sb->s_flags; int nr_orphans = 0, nr_truncates = 0; #ifdef CONFIG_QUOTA + int quota_update = 0; int i; #endif if (!es->s_last_orphan) { @@ -2243,14 +2244,32 @@ static void ext4_orphan_cleanup(struct super_block *sb, #ifdef CONFIG_QUOTA /* Needed for iput() to work correctly and not trash data */ sb->s_flags |= MS_ACTIVE; - /* Turn on journaled quotas so that they are updated correctly */ + + /* + * Turn on quotas which were not enabled for read-only mounts if + * filesystem has quota feature, so that they are updated correctly. + */ + if (ext4_has_feature_quota(sb) && (s_flags & MS_RDONLY)) { + int ret = ext4_enable_quotas(sb); + + if (!ret) + quota_update = 1; + else + ext4_msg(sb, KERN_ERR, + "Cannot turn on quotas: error %d", ret); + } + + /* Turn on journaled quotas used for old sytle */ for (i = 0; i < EXT4_MAXQUOTAS; i++) { if (EXT4_SB(sb)->s_qf_names[i]) { int ret = ext4_quota_on_mount(sb, i); - if (ret < 0) + + if (!ret) + quota_update = 1; + else ext4_msg(sb, KERN_ERR, "Cannot turn on journaled " - "quota: error %d", ret); + "quota: type %d: error %d", i, ret); } } #endif @@ -2309,10 +2328,12 @@ static void ext4_orphan_cleanup(struct super_block *sb, ext4_msg(sb, KERN_INFO, "%d truncate%s cleaned up", PLURAL(nr_truncates)); #ifdef CONFIG_QUOTA - /* Turn off journaled quotas if they were enabled for orphan cleanup */ - for (i = 0; i < EXT4_MAXQUOTAS; i++) { - if (EXT4_SB(sb)->s_qf_names[i] && sb_dqopt(sb)->files[i]) - dquot_quota_off(sb, i); + /* Turn off quotas if they were enabled for orphan cleanup */ + if (quota_update) { + for (i = 0; i < EXT4_MAXQUOTAS; i++) { + if (sb_dqopt(sb)->files[i]) + dquot_quota_off(sb, i); + } } #endif sb->s_flags = s_flags; /* Restore MS_RDONLY status */ @@ -5120,6 +5141,9 @@ static int ext4_enable_quotas(struct super_block *sb) err = ext4_quota_enable(sb, type, QFMT_VFS_V1, DQUOT_USAGE_ENABLED); if (err) { + for (type--; type >= 0; type--) + dquot_quota_off(sb, type); + ext4_warning(sb, "Failed to enable quota tracking " "(type=%d, err=%d). Please run " From a918d32583e0d4ed9d9aac49988c5dfa384f3ede Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Thu, 24 Aug 2017 20:49:57 +1000 Subject: [PATCH 215/546] powerpc: Fix DAR reporting when alignment handler faults commit f9effe925039cf54489b5c04e0d40073bb3a123d upstream. Anton noticed that if we fault part way through emulating an unaligned instruction, we don't update the DAR to reflect that. The DAR value is eventually reported back to userspace as the address in the SEGV signal, and if userspace is using that value to demand fault then it can be confused by us not setting the value correctly. This patch is ugly as hell, but is intended to be the minimal fix and back ports easily. Signed-off-by: Michael Ellerman Reviewed-by: Paul Mackerras Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kernel/align.c | 119 ++++++++++++++++++++++-------------- 1 file changed, 74 insertions(+), 45 deletions(-) diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 91e5c1758b5c..64e016abb2a5 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -236,6 +236,28 @@ static int emulate_dcbz(struct pt_regs *regs, unsigned char __user *addr) #define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) +#define __get_user_or_set_dar(_regs, _dest, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__get_user_inatomic(_dest, __addr)) { \ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + +#define __put_user_or_set_dar(_regs, _src, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__put_user_inatomic(_src, __addr)) { \ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, unsigned int reg, unsigned int nb, unsigned int flags, unsigned int instr, @@ -264,9 +286,10 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, } else { unsigned long pc = regs->nip ^ (swiz & 4); - if (__get_user_inatomic(instr, - (unsigned int __user *)pc)) + if (__get_user_or_set_dar(regs, instr, + (unsigned int __user *)pc)) return -EFAULT; + if (swiz == 0 && (flags & SW)) instr = cpu_to_le32(instr); nb = (instr >> 11) & 0x1f; @@ -310,31 +333,31 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, ((nb0 + 3) / 4) * sizeof(unsigned long)); for (i = 0; i < nb; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, + REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; } } else { for (i = 0; i < nb; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__put_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) + if (__put_user_or_set_dar(regs, + REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; } } @@ -346,29 +369,32 @@ static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, * Only POWER6 has these instructions, and it does true little-endian, * so we don't need the address swizzling. */ -static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, - unsigned int flags) +static int emulate_fp_pair(struct pt_regs *regs, unsigned char __user *addr, + unsigned int reg, unsigned int flags) { char *ptr0 = (char *) ¤t->thread.TS_FPR(reg); char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); - int i, ret, sw = 0; + int i, sw = 0; if (reg & 1) return 0; /* invalid form: FRS/FRT must be even */ if (flags & SW) sw = 7; - ret = 0; + for (i = 0; i < 8; ++i) { if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); + if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); + if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } } - if (ret) - return -EFAULT; + return 1; /* exception handled and fixed up */ } @@ -378,24 +404,27 @@ static int emulate_lq_stq(struct pt_regs *regs, unsigned char __user *addr, { char *ptr0 = (char *)®s->gpr[reg]; char *ptr1 = (char *)®s->gpr[reg+1]; - int i, ret, sw = 0; + int i, sw = 0; if (reg & 1) return 0; /* invalid form: GPR must be even */ if (flags & SW) sw = 7; - ret = 0; + for (i = 0; i < 8; ++i) { if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); + if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); + if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } } - if (ret) - return -EFAULT; + return 1; /* exception handled and fixed up */ } #endif /* CONFIG_PPC64 */ @@ -688,9 +717,14 @@ static int emulate_vsx(unsigned char __user *addr, unsigned int reg, for (j = 0; j < length; j += elsize) { for (i = 0; i < elsize; ++i) { if (flags & ST) - ret |= __put_user(ptr[i^sw], addr + i); + ret = __put_user_or_set_dar(regs, ptr[i^sw], + addr + i); else - ret |= __get_user(ptr[i^sw], addr + i); + ret = __get_user_or_set_dar(regs, ptr[i^sw], + addr + i); + + if (ret) + return ret; } ptr += elsize; #ifdef __LITTLE_ENDIAN__ @@ -740,7 +774,7 @@ int fix_alignment(struct pt_regs *regs) unsigned int dsisr; unsigned char __user *addr; unsigned long p, swiz; - int ret, i; + int i; union data { u64 ll; double dd; @@ -923,7 +957,7 @@ int fix_alignment(struct pt_regs *regs) if (flags & F) { /* Special case for 16-byte FP loads and stores */ PPC_WARN_ALIGNMENT(fp_pair, regs); - return emulate_fp_pair(addr, reg, flags); + return emulate_fp_pair(regs, addr, reg, flags); } else { #ifdef CONFIG_PPC64 /* Special case for 16-byte loads and stores */ @@ -953,15 +987,12 @@ int fix_alignment(struct pt_regs *regs) } data.ll = 0; - ret = 0; p = (unsigned long)addr; for (i = 0; i < nb; i++) - ret |= __get_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; + if (__get_user_or_set_dar(regs, data.v[start + i], + SWIZ_PTR(p++))) + return -EFAULT; } else if (flags & F) { data.ll = current->thread.TS_FPR(reg); @@ -1031,15 +1062,13 @@ int fix_alignment(struct pt_regs *regs) break; } - ret = 0; p = (unsigned long)addr; for (i = 0; i < nb; i++) - ret |= __put_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); + if (__put_user_or_set_dar(regs, data.v[start + i], + SWIZ_PTR(p++))) + return -EFAULT; - if (unlikely(ret)) - return -EFAULT; } else if (flags & F) current->thread.TS_FPR(reg) = data.ll; else From 30e81e7fe197dd14d5b7653c75140ea75fe5c3d4 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:44 -0700 Subject: [PATCH 216/546] block: Relax a check in blk_start_queue() commit 4ddd56b003f251091a67c15ae3fe4a5c5c5e390a upstream. Calling blk_start_queue() from interrupt context with the queue lock held and without disabling IRQs, as the skd driver does, is safe. This patch avoids that loading the skd driver triggers the following warning: WARNING: CPU: 11 PID: 1348 at block/blk-core.c:283 blk_start_queue+0x84/0xa0 RIP: 0010:blk_start_queue+0x84/0xa0 Call Trace: skd_unquiesce_dev+0x12a/0x1d0 [skd] skd_complete_internal+0x1e7/0x5a0 [skd] skd_complete_other+0xc2/0xd0 [skd] skd_isr_completion_posted.isra.30+0x2a5/0x470 [skd] skd_isr+0x14f/0x180 [skd] irq_forced_thread_fn+0x2a/0x70 irq_thread+0x144/0x1a0 kthread+0x125/0x140 ret_from_fork+0x2a/0x40 Fixes: commit a038e2536472 ("[PATCH] blk_start_queue() must be called with irq disabled - add warning") Signed-off-by: Bart Van Assche Cc: Paolo 'Blaisorblade' Giarrusso Cc: Andrew Morton Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/blk-core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/block/blk-core.c b/block/blk-core.c index ef083e7a37c5..119658534dfd 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -233,7 +233,7 @@ EXPORT_SYMBOL(blk_start_queue_async); **/ void blk_start_queue(struct request_queue *q) { - WARN_ON(!irqs_disabled()); + WARN_ON(!in_interrupt() && !irqs_disabled()); queue_flag_clear(QUEUE_FLAG_STOPPED, q); __blk_run_queue(q); From f05dafbd779112307e8dfe0ed8226910d29f0020 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Thu, 31 Aug 2017 10:23:25 +1000 Subject: [PATCH 217/546] md/bitmap: disable bitmap_resize for file-backed bitmaps. commit e8a27f836f165c26f867ece7f31eb5c811692319 upstream. bitmap_resize() does not work for file-backed bitmaps. The buffer_heads are allocated and initialized when the bitmap is read from the file, but resize doesn't read from the file, it loads from the internal bitmap. When it comes time to write the new bitmap, the bh is non-existent and we crash. The common case when growing an array involves making the array larger, and that normally means making the bitmap larger. Doing that inside the kernel is possible, but would need more code. It is probably easier to require people who use file-backed bitmaps to remove them and re-add after a reshape. So this patch disables the resizing of arrays which have file-backed bitmaps. This is better than crashing. Reported-by: Zhilong Liu Fixes: d60b479d177a ("md/bitmap: add bitmap_resize function to allow bitmap resizing.") Signed-off-by: NeilBrown Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/md/bitmap.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/md/bitmap.c b/drivers/md/bitmap.c index 4f22e919787a..7a50728b9389 100644 --- a/drivers/md/bitmap.c +++ b/drivers/md/bitmap.c @@ -1960,6 +1960,11 @@ int bitmap_resize(struct bitmap *bitmap, sector_t blocks, long pages; struct bitmap_page *new_bp; + if (bitmap->storage.file && !init) { + pr_info("md: cannot resize file-based bitmap\n"); + return -EINVAL; + } + if (chunksize == 0) { /* If there is enough space, leave the chunk size unchanged, * else increase by factor of two until there is enough space. From 0bcaf5178fe6cc3169d4ef47e92e84e938bf7b3c Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:45 -0700 Subject: [PATCH 218/546] skd: Avoid that module unloading triggers a use-after-free commit 7277cc67b3916eed47558c64f9c9c0de00a35cda upstream. Since put_disk() triggers a disk_release() call and since that last function calls blk_put_queue() if disk->queue != NULL, clear the disk->queue pointer before calling put_disk(). This avoids that unloading the skd kernel module triggers the following use-after-free: WARNING: CPU: 8 PID: 297 at lib/refcount.c:128 refcount_sub_and_test+0x70/0x80 refcount_t: underflow; use-after-free. CPU: 8 PID: 297 Comm: kworker/8:1 Not tainted 4.11.10-300.fc26.x86_64 #1 Workqueue: events work_for_cpu_fn Call Trace: dump_stack+0x63/0x84 __warn+0xcb/0xf0 warn_slowpath_fmt+0x5a/0x80 refcount_sub_and_test+0x70/0x80 refcount_dec_and_test+0x11/0x20 kobject_put+0x1f/0x50 blk_put_queue+0x15/0x20 disk_release+0xae/0xf0 device_release+0x32/0x90 kobject_release+0x67/0x170 kobject_put+0x2b/0x50 put_disk+0x17/0x20 skd_destruct+0x5c/0x890 [skd] skd_pci_probe+0x124d/0x13a0 [skd] local_pci_probe+0x42/0xa0 work_for_cpu_fn+0x14/0x20 process_one_work+0x19e/0x470 worker_thread+0x1dc/0x4a0 kthread+0x125/0x140 ret_from_fork+0x25/0x30 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/skd_main.c | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index 586f9168ffa4..a1f8bf96d8b3 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -4679,15 +4679,16 @@ static void skd_free_disk(struct skd_device *skdev) { struct gendisk *disk = skdev->disk; - if (disk != NULL) { - struct request_queue *q = disk->queue; + if (disk && (disk->flags & GENHD_FL_UP)) + del_gendisk(disk); - if (disk->flags & GENHD_FL_UP) - del_gendisk(disk); - if (q) - blk_cleanup_queue(q); - put_disk(disk); + if (skdev->queue) { + blk_cleanup_queue(skdev->queue); + skdev->queue = NULL; + disk->queue = NULL; } + + put_disk(disk); skdev->disk = NULL; } From 19978c50db689ab0691080a65d4a635aebd0f6a7 Mon Sep 17 00:00:00 2001 From: Bart Van Assche Date: Thu, 17 Aug 2017 13:12:46 -0700 Subject: [PATCH 219/546] skd: Submit requests to firmware before triggering the doorbell commit 5fbd545cd3fd311ea1d6e8be4cedddd0ee5684c7 upstream. Ensure that the members of struct skd_msg_buf have been transferred to the PCIe adapter before the doorbell is triggered. This patch avoids that I/O fails sporadically and that the following error message is reported: (skd0:STM000196603:[0000:00:09.0]): Completion mismatch comp_id=0x0000 skreq=0x0400 new=0x0000 Signed-off-by: Bart Van Assche Cc: Christoph Hellwig Cc: Hannes Reinecke Cc: Johannes Thumshirn Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/block/skd_main.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/block/skd_main.c b/drivers/block/skd_main.c index a1f8bf96d8b3..47d1e834f3f4 100644 --- a/drivers/block/skd_main.c +++ b/drivers/block/skd_main.c @@ -2214,6 +2214,9 @@ static void skd_send_fitmsg(struct skd_device *skdev, */ qcmd |= FIT_QCMD_MSGSIZE_64; + /* Make sure skd_msg_buf is written before the doorbell is triggered. */ + smp_wmb(); + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); } @@ -2260,6 +2263,9 @@ static void skd_send_special_fitmsg(struct skd_device *skdev, qcmd = skspcl->mb_dma_address; qcmd |= FIT_QCMD_QID_NORMAL + FIT_QCMD_MSGSIZE_128; + /* Make sure skd_msg_buf is written before the doorbell is triggered. */ + smp_wmb(); + SKD_WRITEQ(skdev, qcmd, FIT_Q_COMMAND); } From cfc49967434db15f202204eae4306b2a78d9ea03 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:51 +0200 Subject: [PATCH 220/546] scsi: zfcp: fix queuecommand for scsi_eh commands when DIX enabled commit 71b8e45da51a7b64a23378221c0a5868bd79da4f upstream. Since commit db007fc5e20c ("[SCSI] Command protection operation"), scsi_eh_prep_cmnd() saves scmd->prot_op and temporarily resets it to SCSI_PROT_NORMAL. Other FCP LLDDs such as qla2xxx and lpfc shield their queuecommand() to only access any of scsi_prot_sg...() if (scsi_get_prot_op(cmd) != SCSI_PROT_NORMAL). Do the same thing for zfcp, which introduced DIX support with commit ef3eb71d8ba4 ("[SCSI] zfcp: Introduce experimental support for DIF/DIX"). Otherwise, TUR SCSI commands as part of scsi_eh likely fail in zfcp, because the regular SCSI command with DIX protection data, that scsi_eh re-uses in scsi_send_eh_cmnd(), of course still has (scsi_prot_sg_count() != 0) and so zfcp sends down bogus requests to the FCP channel hardware. This causes scsi_eh_test_devices() to have (finish_cmds == 0) [not SCSI device is online or not scsi_eh_tur() failed] so regular SCSI commands, that caused / were affected by scsi_eh, are moved to work_q and scsi_eh_test_devices() itself returns false. In turn, it unnecessarily escalates in our case in scsi_eh_ready_devs() beyond host reset to finally scsi_eh_offline_sdevs() which sets affected SCSI devices offline with the following kernel message: "kernel: sd H:0:T:L: Device offlined - not ready after error recovery" Signed-off-by: Steffen Maier Fixes: ef3eb71d8ba4 ("[SCSI] zfcp: Introduce experimental support for DIF/DIX") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_fsf.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 27ff38f839fc..4efdb7415b03 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -2258,7 +2258,8 @@ int zfcp_fsf_fcp_cmnd(struct scsi_cmnd *scsi_cmnd) fcp_cmnd = (struct fcp_cmnd *) &req->qtcb->bottom.io.fcp_cmnd; zfcp_fc_scsi_to_fcp(fcp_cmnd, scsi_cmnd, 0); - if (scsi_prot_sg_count(scsi_cmnd)) { + if ((scsi_get_prot_op(scsi_cmnd) != SCSI_PROT_NORMAL) && + scsi_prot_sg_count(scsi_cmnd)) { zfcp_qdio_set_data_div(qdio, &req->qdio_req, scsi_prot_sg_count(scsi_cmnd)); retval = zfcp_qdio_sbals_from_sg(qdio, &req->qdio_req, From d0c02c6f3e8589f9276f92b1633629721c13ad20 Mon Sep 17 00:00:00 2001 From: Benjamin Block Date: Fri, 28 Jul 2017 12:30:52 +0200 Subject: [PATCH 221/546] scsi: zfcp: add handling for FCP_RESID_OVER to the fcp ingress path commit a099b7b1fc1f0418ab8d79ecf98153e1e134656e upstream. Up until now zfcp would just ignore the FCP_RESID_OVER flag in the FCP response IU. When this flag is set, it is possible, in regards to the FCP standard, that the storage-server processes the command normally, up to the point where data is missing and simply ignores those. In this case no CHECK CONDITION would be set, and because we ignored the FCP_RESID_OVER flag we resulted in at least a data loss or even -corruption as a follow-up error, depending on how the applications/layers on top behave. To prevent this, we now set the host-byte of the corresponding scsi_cmnd to DID_ERROR. Other storage-behaviors, where the same condition results in a CHECK CONDITION set in the answer, don't need to be changed as they are handled in the mid-layer already. Following is an example trace record decoded with zfcpdbf from the s390-tools package. We forcefully injected a fc_dl which is one byte too small: Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : rsl_err Request ID : 0x... SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00070000 ^^DID_ERROR SCSI retries : 0x.. SCSI allowed : 0x.. SCSI scribble : 0x... SCSI opcode : 2a000000 00000000 08000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000400 00000001 ^^fr_flags==FCP_RESID_OVER ^^fr_status==SAM_STAT_GOOD ^^^^^^^^fr_resid 00000000 00000000 As of now, we don't actively handle to possibility that a response IU has both flags - FCP_RESID_OVER and FCP_RESID_UNDER - set at once. Reported-by: Luke M. Hopkins Reviewed-by: Steffen Maier Fixes: 553448f6c483 ("[SCSI] zfcp: Message cleanup") Fixes: ea127f975424 ("[PATCH] s390 (7/7): zfcp host adapter.") (tglx/history.git) Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_fc.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/s390/scsi/zfcp_fc.h b/drivers/s390/scsi/zfcp_fc.h index df2b541c8287..a2275825186f 100644 --- a/drivers/s390/scsi/zfcp_fc.h +++ b/drivers/s390/scsi/zfcp_fc.h @@ -4,7 +4,7 @@ * Fibre Channel related definitions and inline functions for the zfcp * device driver * - * Copyright IBM Corp. 2009 + * Copyright IBM Corp. 2009, 2017 */ #ifndef ZFCP_FC_H @@ -279,6 +279,10 @@ void zfcp_fc_eval_fcp_rsp(struct fcp_resp_with_ext *fcp_rsp, !(rsp_flags & FCP_SNS_LEN_VAL) && fcp_rsp->resp.fr_status == SAM_STAT_GOOD) set_host_byte(scsi, DID_ERROR); + } else if (unlikely(rsp_flags & FCP_RESID_OVER)) { + /* FCP_DL was not sufficient for SCSI data length */ + if (fcp_rsp->resp.fr_status == SAM_STAT_GOOD) + set_host_byte(scsi, DID_ERROR); } } From 52661717ee664d349ba789cac386e8b046f8ed79 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:53 +0200 Subject: [PATCH 222/546] scsi: zfcp: fix capping of unsuccessful GPN_FT SAN response trace records commit 975171b4461be296a35e83ebd748946b81cf0635 upstream. v4.9 commit aceeffbb59bb ("zfcp: trace full payload of all SAN records (req,resp,iels)") fixed trace data loss of 2.6.38 commit 2c55b750a884 ("[SCSI] zfcp: Redesign of the debug tracing for SAN records.") necessary for problem determination, e.g. to see the currently active zone set during automatic port scan. While it already saves space by not dumping any empty residual entries of the large successful GPN_FT response (4 pages), there are seldom cases where the GPN_FT response is unsuccessful and likely does not have FC_NS_FID_LAST set in fp_flags so we did not cap the trace record. We typically see such case for an initiator WWPN, which is not in any zone. Cap unsuccessful responses to at least the actual basic CT_IU response plus whatever fits the SAN trace record built-in "payload" buffer just in case there's trailing information of which we would at least see the existence and its beginning. In order not to erroneously cap successful responses, we need to swap calling the trace function and setting the CT / ELS status to success (0). Example trace record pair formatted with zfcpdbf: Timestamp : ... Area : SAN Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : fssct_1 Request ID : 0x Destination ID : 0x00fffffc SAN req short : 01000000 fc020000 01720ffc 00000000 00000008 SAN req length : 20 | Timestamp : ... Area : SAN Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 2 Tag : fsscth2 Request ID : 0x Destination ID : 0x00fffffc SAN resp short : 01000000 fc020000 80010000 00090700 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] SAN resp length: 16384 San resp info : 01000000 fc020000 80010000 00090700 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] 00000000 00000000 00000000 00000000 [trailing info] The fix saves all but one of the previously associated 64 PAYload trace record chunks of size 256 bytes each. Signed-off-by: Steffen Maier Fixes: aceeffbb59bb ("zfcp: trace full payload of all SAN records (req,resp,iels)") Fixes: 2c55b750a884 ("[SCSI] zfcp: Redesign of the debug tracing for SAN records.") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.c | 10 +++++++++- drivers/s390/scsi/zfcp_fsf.c | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c index d5bf36ec8a75..31d62ea5fdcd 100644 --- a/drivers/s390/scsi/zfcp_dbf.c +++ b/drivers/s390/scsi/zfcp_dbf.c @@ -3,7 +3,7 @@ * * Debug traces for zfcp. * - * Copyright IBM Corp. 2002, 2016 + * Copyright IBM Corp. 2002, 2017 */ #define KMSG_COMPONENT "zfcp" @@ -447,6 +447,7 @@ static u16 zfcp_dbf_san_res_cap_len_if_gpn_ft(char *tag, struct fc_ct_hdr *reqh = sg_virt(ct_els->req); struct fc_ns_gid_ft *reqn = (struct fc_ns_gid_ft *)(reqh + 1); struct scatterlist *resp_entry = ct_els->resp; + struct fc_ct_hdr *resph; struct fc_gpn_ft_resp *acc; int max_entries, x, last = 0; @@ -473,6 +474,13 @@ static u16 zfcp_dbf_san_res_cap_len_if_gpn_ft(char *tag, return len; /* not GPN_FT response so do not cap */ acc = sg_virt(resp_entry); + + /* cap all but accept CT responses to at least the CT header */ + resph = (struct fc_ct_hdr *)acc; + if ((ct_els->status) || + (resph->ct_cmd != cpu_to_be16(FC_FS_ACC))) + return max(FC_CT_HDR_LEN, ZFCP_DBF_SAN_MAX_PAYLOAD); + max_entries = (reqh->ct_mr_size * 4 / sizeof(struct fc_gpn_ft_resp)) + 1 /* zfcp_fc_scan_ports: bytes correct, entries off-by-one * to account for header as 1st pseudo "entry" */; diff --git a/drivers/s390/scsi/zfcp_fsf.c b/drivers/s390/scsi/zfcp_fsf.c index 4efdb7415b03..1964391db904 100644 --- a/drivers/s390/scsi/zfcp_fsf.c +++ b/drivers/s390/scsi/zfcp_fsf.c @@ -928,8 +928,8 @@ static void zfcp_fsf_send_ct_handler(struct zfcp_fsf_req *req) switch (header->fsf_status) { case FSF_GOOD: - zfcp_dbf_san_res("fsscth2", req); ct->status = 0; + zfcp_dbf_san_res("fsscth2", req); break; case FSF_SERVICE_CLASS_NOT_SUPPORTED: zfcp_fsf_class_not_supp(req); @@ -1109,8 +1109,8 @@ static void zfcp_fsf_send_els_handler(struct zfcp_fsf_req *req) switch (header->fsf_status) { case FSF_GOOD: - zfcp_dbf_san_res("fsselh1", req); send_els->status = 0; + zfcp_dbf_san_res("fsselh1", req); break; case FSF_SERVICE_CLASS_NOT_SUPPORTED: zfcp_fsf_class_not_supp(req); From 1a847369487c31b405e3bda614a36612250905ac Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:54 +0200 Subject: [PATCH 223/546] scsi: zfcp: fix passing fsf_req to SCSI trace on TMF to correlate with HBA commit 9fe5d2b2fd30aa8c7827ec62cbbe6d30df4fe3e3 upstream. Without this fix we get SCSI trace records on task management functions which cannot be correlated to HBA trace records because all fields related to the FSF request are empty (zero). Also, the FCP_RSP_IU is missing as well as any sense data if available. This was caused by v2.6.14 commit 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") introducing trace records for TMFs but hard coding NULL for a possibly existing TMF FSF request. The scsi_cmnd scribble is also zero or unrelated for the TMF request so it also could not lookup a suitable FSF request from there. A broken example trace record formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : SCSI Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : lr_fail Request ID : 0x0000000000000000 ^^^^^^^^^^^^^^^^ no correlation to HBA record SCSI ID : 0x SCSI LUN : 0x SCSI result : 0x000e0000 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x0000000000000000 SCSI opcode : 2a000017 3bb80000 08000000 00000000 FCP rsp inf cod: 0x00 ^^ no TMF response FCP rsp IU : 00000000 00000000 00000000 00000000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 00000000 00000000 ^^^^^^^^^^^^^^^^^ no interesting FCP_RSP_IU Sense len : ... ^^^^^^^^^^^^^^^^^^^^ no sense data length Sense info : ... ^^^^^^^^^^^^^^^^^^^^ no sense data content, even if present There are some true cases where we really do not have an FSF request: "rsl_fai" from zfcp_dbf_scsi_fail_send() called for early returns / completions in zfcp_scsi_queuecommand(), "abrt_or", "abrt_bl", "abrt_ru", "abrt_ar" from zfcp_scsi_eh_abort_handler() where we did not get as far, "lr_nres", "tr_nres" from zfcp_task_mgmt_function() where we're successful and do not need to do anything because adapter stopped. For these cases it's correct to pass NULL for fsf_req to _zfcp_dbf_scsi(). Signed-off-by: Steffen Maier Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.h | 7 ++++--- drivers/s390/scsi/zfcp_scsi.c | 8 ++++---- 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.h b/drivers/s390/scsi/zfcp_dbf.h index db186d44cfaf..776d1ac125ff 100644 --- a/drivers/s390/scsi/zfcp_dbf.h +++ b/drivers/s390/scsi/zfcp_dbf.h @@ -2,7 +2,7 @@ * zfcp device driver * debug feature declarations * - * Copyright IBM Corp. 2008, 2016 + * Copyright IBM Corp. 2008, 2017 */ #ifndef ZFCP_DBF_H @@ -401,7 +401,8 @@ void zfcp_dbf_scsi_abort(char *tag, struct scsi_cmnd *scmd, * @flag: indicates type of reset (Target Reset, Logical Unit Reset) */ static inline -void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag) +void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag, + struct zfcp_fsf_req *fsf_req) { char tmp_tag[ZFCP_DBF_TAG_LEN]; @@ -411,7 +412,7 @@ void zfcp_dbf_scsi_devreset(char *tag, struct scsi_cmnd *scmnd, u8 flag) memcpy(tmp_tag, "lr_", 3); memcpy(&tmp_tag[3], tag, 4); - _zfcp_dbf_scsi(tmp_tag, 1, scmnd, NULL); + _zfcp_dbf_scsi(tmp_tag, 1, scmnd, fsf_req); } /** diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index 07ffdbb5107f..ecce7e858af9 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -3,7 +3,7 @@ * * Interface to Linux SCSI midlayer. * - * Copyright IBM Corp. 2002, 2016 + * Copyright IBM Corp. 2002, 2017 */ #define KMSG_COMPONENT "zfcp" @@ -278,7 +278,7 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) if (!(atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_RUNNING)) { - zfcp_dbf_scsi_devreset("nres", scpnt, tm_flags); + zfcp_dbf_scsi_devreset("nres", scpnt, tm_flags, NULL); return SUCCESS; } } @@ -288,10 +288,10 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) wait_for_completion(&fsf_req->completion); if (fsf_req->status & ZFCP_STATUS_FSFREQ_TMFUNCFAILED) { - zfcp_dbf_scsi_devreset("fail", scpnt, tm_flags); + zfcp_dbf_scsi_devreset("fail", scpnt, tm_flags, fsf_req); retval = FAILED; } else { - zfcp_dbf_scsi_devreset("okay", scpnt, tm_flags); + zfcp_dbf_scsi_devreset("okay", scpnt, tm_flags, fsf_req); zfcp_scsi_forget_cmnds(zfcp_sdev, tm_flags); } From d0fbe221b8f13192afffb5fd44024dbfdfb8f656 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:55 +0200 Subject: [PATCH 224/546] scsi: zfcp: fix missing trace records for early returns in TMF eh handlers commit 1a5d999ebfc7bfe28deb48931bb57faa8e4102b6 upstream. For problem determination we need to see that we were in scsi_eh as well as whether and why we were successful or not. The following commits introduced new early returns without adding a trace record: v2.6.35 commit a1dbfddd02d2 ("[SCSI] zfcp: Pass return code from fc_block_scsi_eh to scsi eh") on fc_block_scsi_eh() returning != 0 which is FAST_IO_FAIL, v2.6.30 commit 63caf367e1c9 ("[SCSI] zfcp: Improve reliability of SCSI eh handlers in zfcp") on not having gotten an FSF request after the maximum number of retry attempts and thus could not issue a TMF and has to return FAILED. Signed-off-by: Steffen Maier Fixes: a1dbfddd02d2 ("[SCSI] zfcp: Pass return code from fc_block_scsi_eh to scsi eh") Fixes: 63caf367e1c9 ("[SCSI] zfcp: Improve reliability of SCSI eh handlers in zfcp") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_scsi.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/s390/scsi/zfcp_scsi.c b/drivers/s390/scsi/zfcp_scsi.c index ecce7e858af9..9bd9b9a29dfc 100644 --- a/drivers/s390/scsi/zfcp_scsi.c +++ b/drivers/s390/scsi/zfcp_scsi.c @@ -273,8 +273,10 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) zfcp_erp_wait(adapter); ret = fc_block_scsi_eh(scpnt); - if (ret) + if (ret) { + zfcp_dbf_scsi_devreset("fiof", scpnt, tm_flags, NULL); return ret; + } if (!(atomic_read(&adapter->status) & ZFCP_STATUS_COMMON_RUNNING)) { @@ -282,8 +284,10 @@ static int zfcp_task_mgmt_function(struct scsi_cmnd *scpnt, u8 tm_flags) return SUCCESS; } } - if (!fsf_req) + if (!fsf_req) { + zfcp_dbf_scsi_devreset("reqf", scpnt, tm_flags, NULL); return FAILED; + } wait_for_completion(&fsf_req->completion); From 7194822422f9ebada58f7fa7db824ee97b949fe2 Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:56 +0200 Subject: [PATCH 225/546] scsi: zfcp: fix payload with full FCP_RSP IU in SCSI trace records commit 12c3e5754c8022a4f2fd1e9f00d19e99ee0d3cc1 upstream. If the FCP_RSP UI has optional parts (FCP_SNS_INFO or FCP_RSP_INFO) and thus does not fit into the fsp_rsp field built into a SCSI trace record, trace the full FCP_RSP UI with all optional parts as payload record instead of just FCP_SNS_INFO as payload and a 1 byte RSP_INFO_CODE part of FCP_RSP_INFO built into the SCSI record. That way we would also get the full FCP_SNS_INFO in case a target would ever send more than min(SCSI_SENSE_BUFFERSIZE==96, ZFCP_DBF_PAY_MAX_REC==256)==96. The mandatory part of FCP_RSP IU is only 24 bytes. PAYload costs at least one full PAY record of 256 bytes anyway. We cap to the hardware response size which is only FSF_FCP_RSP_SIZE==128. So we can just put the whole FCP_RSP IU with any optional parts into PAYload similarly as we do for SAN PAY since v4.9 commit aceeffbb59bb ("zfcp: trace full payload of all SAN records (req,resp,iels)"). This does not cause any additional trace records wasting memory. Decoded trace records were confusing because they showed a hard-coded sense data length of 96 even if the FCP_RSP_IU field FCP_SNS_LEN showed actually less. Since the same commit, we set pl_len for SAN traces to the full length of a request/response even if we cap the corresponding trace. In contrast, here for SCSI traces we set pl_len to the pre-computed length of FCP_RSP IU considering SNS_LEN or RSP_LEN if valid. Nonetheless we trace a hardcoded payload of length FSF_FCP_RSP_SIZE==128 if there were optional parts. This makes it easier for the zfcpdbf tool to format only the relevant part of the long FCP_RSP UI buffer. And any trailing information is still available in the payload trace record just in case. Rename the payload record tag from "fcp_sns" to "fcp_riu" to make the new content explicit to zfcpdbf which can then pick a suitable field name such as "FCP rsp IU all:" instead of "Sense info :" Also, the same zfcpdbf can still be backwards compatible with "fcp_sns". Old example trace record before this fix, formatted with the tool zfcpdbf from s390-tools: Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU id : .. Caller : 0x... Record id : 1 Tag : rsl_err Request id : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00000002 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x SCSI opcode : 00000000 00000000 00000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000202 00000000 ^^==FCP_SNS_LEN_VALID 00000020 00000000 ^^^^^^^^==FCP_SNS_LEN==32 Sense len : 96 <==min(SCSI_SENSE_BUFFERSIZE,ZFCP_DBF_PAY_MAX_REC) Sense info : 70000600 00000018 00000000 29000000 00000400 00000000 00000000 00000000 00000000 00000000 00000000 00000000<==superfluous 00000000 00000000 00000000 00000000<==superfluous 00000000 00000000 00000000 00000000<==superfluous 00000000 00000000 00000000 00000000<==superfluous New example trace records with this fix: Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : rsl_err Request ID : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00000002 SCSI retries : 0x00 SCSI allowed : 0x03 SCSI scribble : 0x SCSI opcode : a30c0112 00000000 02000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000a02 00000200 00000020 00000000 FCP rsp IU len : 56 FCP rsp IU all : 00000000 00000000 00000a02 00000200 ^^=FCP_RESID_UNDER|FCP_SNS_LEN_VALID 00000020 00000000 70000500 00000018 ^^^^^^^^==FCP_SNS_LEN ^^^^^^^^^^^^^^^^^ 00000000 240000cb 00011100 00000000 ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ 00000000 00000000 ^^^^^^^^^^^^^^^^^==FCP_SNS_INFO Timestamp : ... Area : SCSI Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : lr_okay Request ID : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x00000000 SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x SCSI opcode : FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000100 00000000 00000000 00000008 FCP rsp IU len : 32 FCP rsp IU all : 00000000 00000000 00000100 00000000 ^^==FCP_RSP_LEN_VALID 00000000 00000008 00000000 00000000 ^^^^^^^^==FCP_RSP_LEN ^^^^^^^^^^^^^^^^^==FCP_RSP_INFO Signed-off-by: Steffen Maier Fixes: 250a1352b95e ("[SCSI] zfcp: Redesign of the debug tracing for SCSI records.") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.c | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c index 31d62ea5fdcd..c801f9782cb2 100644 --- a/drivers/s390/scsi/zfcp_dbf.c +++ b/drivers/s390/scsi/zfcp_dbf.c @@ -572,19 +572,32 @@ void zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *sc, if (fsf) { rec->fsf_req_id = fsf->req_id; + rec->pl_len = FCP_RESP_WITH_EXT; fcp_rsp = (struct fcp_resp_with_ext *) &(fsf->qtcb->bottom.io.fcp_rsp); + /* mandatory parts of FCP_RSP IU in this SCSI record */ memcpy(&rec->fcp_rsp, fcp_rsp, FCP_RESP_WITH_EXT); if (fcp_rsp->resp.fr_flags & FCP_RSP_LEN_VAL) { fcp_rsp_info = (struct fcp_resp_rsp_info *) &fcp_rsp[1]; rec->fcp_rsp_info = fcp_rsp_info->rsp_code; + rec->pl_len += be32_to_cpu(fcp_rsp->ext.fr_rsp_len); } if (fcp_rsp->resp.fr_flags & FCP_SNS_LEN_VAL) { - rec->pl_len = min((u16)SCSI_SENSE_BUFFERSIZE, - (u16)ZFCP_DBF_PAY_MAX_REC); - zfcp_dbf_pl_write(dbf, sc->sense_buffer, rec->pl_len, - "fcp_sns", fsf->req_id); + rec->pl_len += be32_to_cpu(fcp_rsp->ext.fr_sns_len); } + /* complete FCP_RSP IU in associated PAYload record + * but only if there are optional parts + */ + if (fcp_rsp->resp.fr_flags != 0) + zfcp_dbf_pl_write( + dbf, fcp_rsp, + /* at least one full PAY record + * but not beyond hardware response field + */ + min_t(u16, max_t(u16, rec->pl_len, + ZFCP_DBF_PAY_MAX_REC), + FSF_FCP_RSP_SIZE), + "fcp_riu", fsf->req_id); } debug_event(dbf->scsi, level, rec, sizeof(*rec)); From 1e6c640a75d09064a27d7e08524c0b2d8b17190c Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:57 +0200 Subject: [PATCH 226/546] scsi: zfcp: trace HBA FSF response by default on dismiss or timedout late response commit fdb7cee3b9e3c561502e58137a837341f10cbf8b upstream. At the default trace level, we only trace unsuccessful events including FSF responses. zfcp_dbf_hba_fsf_response() only used protocol status and FSF status to decide on an unsuccessful response. However, this is only one of multiple possible sources determining a failed struct zfcp_fsf_req. An FSF request can also "fail" if its response runs into an ERP timeout or if it gets dismissed because a higher level recovery was triggered [trace tags "erscf_1" or "erscf_2" in zfcp_erp_strategy_check_fsfreq()]. FSF requests with ERP timeout are: FSF_QTCB_EXCHANGE_CONFIG_DATA, FSF_QTCB_EXCHANGE_PORT_DATA, FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT or FSF_QTCB_CLOSE_PHYSICAL_PORT for target ports, FSF_QTCB_OPEN_LUN, FSF_QTCB_CLOSE_LUN. One example is slow queue processing which can cause follow-on errors, e.g. FSF_PORT_ALREADY_OPEN after FSF_QTCB_OPEN_PORT_WITH_DID timed out. In order to see the root cause, we need to see late responses even if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. Example trace records formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : fcegpf1 LUN : 0xffffffffffffffff WWPN : 0x D_ID : 0x00 Adapter status : 0x5400050b Port status : 0x41200000 LUN status : 0x00000000 Ready count : 0x00000001 Running count : 0x... ERP want : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT ERP need : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT | Timestamp : ... 30 seconds later Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : ... Record ID : 2 Tag : erscf_2 LUN : 0xffffffffffffffff WWPN : 0x D_ID : 0x00 Adapter status : 0x5400050b Port status : 0x41200000 LUN status : 0x00000000 Request ID : 0x ERP status : 0x10000000 ZFCP_STATUS_ERP_TIMEDOUT ERP step : 0x0800 ZFCP_ERP_STEP_PORT_OPENING ERP action : 0x02 ZFCP_ERP_ACTION_REOPEN_PORT ERP count : 0x00 | Timestamp : ... later than previous record Area : HBA Subarea : 00 Level : 5 > default level => 3 <= default level Exception : - CPU ID : 00 Caller : ... Record ID : 1 Tag : fs_qtcb => fs_rerr Request ID : 0x Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED | ZFCP_STATUS_FSFREQ_CLEANUP FSF cmnd : 0x00000005 FSF sequence no: 0x... FSF issued : ... > 30 seconds ago FSF stat : 0x00000000 FSF_GOOD FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 FSF_PROT_GOOD Prot stat qual : 00000000 00000000 00000000 00000000 Port handle : 0x... LUN handle : 0x00000000 QTCB log length: ... QTCB log info : ... In case of problems detecting that new responses are waiting on the input queue, we sooner or later trigger adapter recovery due to an FSF request timeout (trace tag "fsrth_1"). FSF requests with FSF request timeout are: typically FSF_QTCB_ABORT_FCP_CMND; but theoretically also FSF_QTCB_EXCHANGE_CONFIG_DATA or FSF_QTCB_EXCHANGE_PORT_DATA via sysfs, FSF_QTCB_OPEN_PORT_WITH_DID or FSF_QTCB_CLOSE_PORT for WKA ports, FSF_QTCB_FCP_CMND for task management function (LUN / target reset). One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD because the channel filled in the response via DMA into the request's QTCB. In a theroretical case, inject code can create an erroneous FSF request on purpose. If data router is enabled, it uses deferred error reporting. A READ SCSI command can succeed with FSF_PROT_GOOD, FSF_GOOD, and SAM_STAT_GOOD. But on writing the read data to host memory via DMA, it can still fail, e.g. if an intentionally wrong scatter list does not provide enough space. Rather than getting an unsuccessful response, we get a QDIO activate check which in turn triggers adapter recovery. One or more pending requests can meanwhile have FSF_PROT_GOOD and FSF_GOOD because the channel filled in the response via DMA into the request's QTCB. Example trace records formatted with zfcpdbf from the s390-tools package: Timestamp : ... Area : HBA Subarea : 00 Level : 6 > default level => 3 <= default level Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : fs_norm => fs_rerr Request ID : 0x Request status : 0x00001010 ZFCP_STATUS_FSFREQ_DISMISSED | ZFCP_STATUS_FSFREQ_CLEANUP FSF cmnd : 0x00000001 FSF sequence no: 0x... FSF issued : ... FSF stat : 0x00000000 FSF_GOOD FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 FSF_PROT_GOOD Prot stat qual : ........ ........ 00000000 00000000 Port handle : 0x... LUN handle : 0x... | Timestamp : ... Area : SCSI Subarea : 00 Level : 3 Exception : - CPU ID : .. Caller : ... Record ID : 1 Tag : rsl_err Request ID : 0x SCSI ID : 0x... SCSI LUN : 0x... SCSI result : 0x000e0000 DID_TRANSPORT_DISRUPTED SCSI retries : 0x00 SCSI allowed : 0x05 SCSI scribble : 0x SCSI opcode : 28... Read(10) FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000000 00000000 ^^ SAM_STAT_GOOD 00000000 00000000 Only with luck in both above cases, we could see a follow-on trace record of an unsuccesful event following a successful but late FSF response with FSF_PROT_GOOD and FSF_GOOD. Typically this was the case for I/O requests resulting in a SCSI trace record "rsl_err" with DID_TRANSPORT_DISRUPTED [On ZFCP_STATUS_FSFREQ_DISMISSED, zfcp_fsf_protstatus_eval() sets ZFCP_STATUS_FSFREQ_ERROR seen by the request handler functions as failure]. However, the reason for this follow-on trace was invisible because the corresponding HBA trace record was missing at the default trace level (by default hidden records with tags "fs_norm", "fs_qtcb", or "fs_open"). On adapter recovery, after we had shut down the QDIO queues, we perform unsuccessful pseudo completions with flag ZFCP_STATUS_FSFREQ_DISMISSED for each pending FSF request in zfcp_fsf_req_dismiss_all(). In order to find the root cause, we need to see all pseudo responses even if the channel presented them successfully with FSF_PROT_GOOD and FSF_GOOD. Therefore, check zfcp_fsf_req.status for ZFCP_STATUS_FSFREQ_DISMISSED or ZFCP_STATUS_FSFREQ_ERROR and trace with a new tag "fs_rerr". It does not matter that there are numerous places which set ZFCP_STATUS_FSFREQ_ERROR after the location where we trace an FSF response early. These cases are based on protocol status != FSF_PROT_GOOD or == FSF_PROT_FSF_STATUS_PRESENTED and are thus already traced by default as trace tag "fs_perr" or "fs_ferr" respectively. NB: The trace record with tag "fssrh_1" for status read buffers on dismiss all remains. zfcp_fsf_req_complete() handles this and returns early. All other FSF request types are handled separately and as described above. Signed-off-by: Steffen Maier Fixes: 8a36e4532ea1 ("[SCSI] zfcp: enhancement of zfcp debug features") Fixes: 2e261af84cdb ("[SCSI] zfcp: Only collect FSF/HBA debug data for matching trace levels") Reviewed-by: Benjamin Block Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.h | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/s390/scsi/zfcp_dbf.h b/drivers/s390/scsi/zfcp_dbf.h index 776d1ac125ff..8e7f8e6037d2 100644 --- a/drivers/s390/scsi/zfcp_dbf.h +++ b/drivers/s390/scsi/zfcp_dbf.h @@ -323,7 +323,11 @@ void zfcp_dbf_hba_fsf_response(struct zfcp_fsf_req *req) { struct fsf_qtcb *qtcb = req->qtcb; - if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) && + if (unlikely(req->status & (ZFCP_STATUS_FSFREQ_DISMISSED | + ZFCP_STATUS_FSFREQ_ERROR))) { + zfcp_dbf_hba_fsf_resp("fs_rerr", 3, req); + + } else if ((qtcb->prefix.prot_status != FSF_PROT_GOOD) && (qtcb->prefix.prot_status != FSF_PROT_FSF_STATUS_PRESENTED)) { zfcp_dbf_hba_fsf_resp("fs_perr", 1, req); From 4dd6cbbc2191587942c0bcd4630b191a1064487c Mon Sep 17 00:00:00 2001 From: Steffen Maier Date: Fri, 28 Jul 2017 12:30:58 +0200 Subject: [PATCH 227/546] scsi: zfcp: trace high part of "new" 64 bit SCSI LUN commit 5d4a3d0a2ff23799b956e5962b886287614e7fad upstream. Complements debugging aspects of the otherwise functionally complete v3.17 commit 9cb78c16f5da ("scsi: use 64-bit LUNs"). While I don't have access to a target exporting 3 or 4 level LUNs, I did test it by explicitly attaching a non-existent fake 4 level LUN by means of zfcp sysfs attribute "unit_add". In order to see corresponding trace records of otherwise successful events, we had to increase the trace level of area SCSI and HBA to 6. $ echo 6 > /sys/kernel/debug/s390dbf/zfcp_0.0.1880_scsi/level $ echo 6 > /sys/kernel/debug/s390dbf/zfcp_0.0.1880_hba/level $ echo 0x4011402240334044 > \ /sys/bus/ccw/drivers/zfcp/0.0.1880/0x50050763031bd327/unit_add Example output formatted by an updated zfcpdbf from the s390-tools package interspersed with kernel messages at scsi_logging_level=4605: Timestamp : ... Area : REC Subarea : 00 Level : 1 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : scsla_1 LUN : 0x4011402240334044 WWPN : 0x50050763031bd327 D_ID : 0x00...... Adapter status : 0x5400050b Port status : 0x54000001 LUN status : 0x41000000 Ready count : 0x00000001 Running count : 0x00000000 ERP want : 0x01 ERP need : 0x01 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY pass 1 length 36 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY successful with code 0x0 Timestamp : ... Area : HBA Subarea : 00 Level : 6 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : fs_norm Request ID : 0x Request status : 0x00000010 FSF cmnd : 0x00000001 FSF sequence no: 0x... FSF issued : ... FSF stat : 0x00000000 FSF stat qual : 00000000 00000000 00000000 00000000 Prot stat : 0x00000001 Prot stat qual : ........ ........ 00000000 00000000 Port handle : 0x... LUN handle : 0x... | Timestamp : ... Area : SCSI Subarea : 00 Level : 6 Exception : - CPU ID : .. Caller : 0x... Record ID : 1 Tag : rsl_nor Request ID : 0x SCSI ID : 0x00000000 SCSI LUN : 0x40224011 SCSI LUN high : 0x40444033 <======================= SCSI result : 0x00000000 SCSI retries : 0x00 SCSI allowed : 0x03 SCSI scribble : 0x SCSI opcode : 12000000 a4000000 00000000 00000000 FCP rsp inf cod: 0x00 FCP rsp IU : 00000000 00000000 00000000 00000000 00000000 00000000 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY pass 2 length 164 scsi 2:0:0:4630896905707208721: scsi scan: INQUIRY successful with code 0x0 scsi 2:0:0:4630896905707208721: scsi scan: peripheral device type of 31, \ no device added Signed-off-by: Steffen Maier Fixes: 9cb78c16f5da ("scsi: use 64-bit LUNs") Reviewed-by: Benjamin Block Reviewed-by: Jens Remus Signed-off-by: Benjamin Block Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/s390/scsi/zfcp_dbf.c | 2 +- drivers/s390/scsi/zfcp_dbf.h | 4 +++- 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/s390/scsi/zfcp_dbf.c b/drivers/s390/scsi/zfcp_dbf.c index c801f9782cb2..34367d172961 100644 --- a/drivers/s390/scsi/zfcp_dbf.c +++ b/drivers/s390/scsi/zfcp_dbf.c @@ -563,8 +563,8 @@ void zfcp_dbf_scsi(char *tag, int level, struct scsi_cmnd *sc, rec->scsi_retries = sc->retries; rec->scsi_allowed = sc->allowed; rec->scsi_id = sc->device->id; - /* struct zfcp_dbf_scsi needs to be updated to handle 64bit LUNs */ rec->scsi_lun = (u32)sc->device->lun; + rec->scsi_lun_64_hi = (u32)(sc->device->lun >> 32); rec->host_scribble = (unsigned long)sc->host_scribble; memcpy(rec->scsi_opcode, sc->cmnd, diff --git a/drivers/s390/scsi/zfcp_dbf.h b/drivers/s390/scsi/zfcp_dbf.h index 8e7f8e6037d2..b60667c145fd 100644 --- a/drivers/s390/scsi/zfcp_dbf.h +++ b/drivers/s390/scsi/zfcp_dbf.h @@ -204,7 +204,7 @@ enum zfcp_dbf_scsi_id { * @id: unique number of recovery record type * @tag: identifier string specifying the location of initiation * @scsi_id: scsi device id - * @scsi_lun: scsi device logical unit number + * @scsi_lun: scsi device logical unit number, low part of 64 bit, old 32 bit * @scsi_result: scsi result * @scsi_retries: current retry number of scsi request * @scsi_allowed: allowed retries @@ -214,6 +214,7 @@ enum zfcp_dbf_scsi_id { * @host_scribble: LLD specific data attached to SCSI request * @pl_len: length of paload stored as zfcp_dbf_pay * @fsf_rsp: response for fsf request + * @scsi_lun_64_hi: scsi device logical unit number, high part of 64 bit */ struct zfcp_dbf_scsi { u8 id; @@ -230,6 +231,7 @@ struct zfcp_dbf_scsi { u64 host_scribble; u16 pl_len; struct fcp_resp_with_ext fcp_rsp; + u32 scsi_lun_64_hi; } __packed; /** From d9b8f1ccbb8c7acddbe2e2fe0dfff51b8c75d361 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Wed, 23 Aug 2017 04:47:01 -0700 Subject: [PATCH 228/546] scsi: megaraid_sas: Check valid aen class range to avoid kernel panic commit 91b3d9f0069c8307d0b3a4c6843b65a439183318 upstream. Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Hannes Reinecke Reviewed-by: Tomas Henzl Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 17c440b9d086..42ddc4d592fc 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -5094,6 +5094,14 @@ megasas_register_aen(struct megasas_instance *instance, u32 seq_num, prev_aen.word = le32_to_cpu(instance->aen_cmd->frame->dcmd.mbox.w[1]); + if ((curr_aen.members.class < MFI_EVT_CLASS_DEBUG) || + (curr_aen.members.class > MFI_EVT_CLASS_DEAD)) { + dev_info(&instance->pdev->dev, + "%s %d out of range class %d send by application\n", + __func__, __LINE__, curr_aen.members.class); + return 0; + } + /* * A class whose enum value is smaller is inclusive of all * higher values. If a PROGRESS (= -1) was previously From b4730f456e21ee98cec3e19837ca3e32fe689c65 Mon Sep 17 00:00:00 2001 From: Shivasharan S Date: Wed, 23 Aug 2017 04:47:04 -0700 Subject: [PATCH 229/546] scsi: megaraid_sas: Return pended IOCTLs with cmd_status MFI_STAT_WRONG_STATE in case adapter is dead commit eb3fe263a48b0d27b229c213929c4cb3b1b39a0f upstream. After a kill adapter, since the cmd_status is not set, the IOCTLs will be hung in driver resulting in application hang. Set cmd_status MFI_STAT_WRONG_STATE when completing pended IOCTLs. Signed-off-by: Kashyap Desai Signed-off-by: Shivasharan S Reviewed-by: Hannes Reinecke Reviewed-by: Tomas Henzl Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/megaraid/megaraid_sas_base.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/megaraid/megaraid_sas_base.c b/drivers/scsi/megaraid/megaraid_sas_base.c index 42ddc4d592fc..6835bae33ec4 100644 --- a/drivers/scsi/megaraid/megaraid_sas_base.c +++ b/drivers/scsi/megaraid/megaraid_sas_base.c @@ -1824,9 +1824,12 @@ static void megasas_complete_outstanding_ioctls(struct megasas_instance *instanc if (cmd_fusion->sync_cmd_idx != (u32)ULONG_MAX) { cmd_mfi = instance->cmd_list[cmd_fusion->sync_cmd_idx]; if (cmd_mfi->sync_cmd && - cmd_mfi->frame->hdr.cmd != MFI_CMD_ABORT) + (cmd_mfi->frame->hdr.cmd != MFI_CMD_ABORT)) { + cmd_mfi->frame->hdr.cmd_status = + MFI_STAT_WRONG_STATE; megasas_complete_cmd(instance, cmd_mfi, DID_OK); + } } } } else { From cf22210c66ca1a252633b8ad9055b082727dff67 Mon Sep 17 00:00:00 2001 From: Long Li Date: Mon, 28 Aug 2017 17:43:59 -0700 Subject: [PATCH 230/546] scsi: storvsc: fix memory leak on ring buffer busy commit 0208eeaa650c5c866a3242201678a19e6dc4a14e upstream. When storvsc is sending I/O to Hyper-v, it may allocate a bigger buffer descriptor for large data payload that can't fit into a pre-allocated buffer descriptor. This bigger buffer is freed on return path. If I/O request to Hyper-v fails due to ring buffer busy, the storvsc allocated buffer descriptor should also be freed. [mkp: applied by hand] Fixes: be0cf6ca301c ("scsi: storvsc: Set the tablesize based on the information given by the host") Signed-off-by: Long Li Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/storvsc_drv.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index cd5c1c060481..6df2841cb7f9 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -1511,6 +1511,8 @@ static int storvsc_queuecommand(struct Scsi_Host *host, struct scsi_cmnd *scmnd) ret = storvsc_do_io(dev, cmd_request); if (ret == -EAGAIN) { + if (payload_sz > sizeof(cmd_request->mpb)) + kfree(payload); /* no more space */ return SCSI_MLQUEUE_DEVICE_BUSY; } From 6b498ad144728628c541dbb703d14142c10311a4 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 7 Apr 2017 09:34:13 +0200 Subject: [PATCH 231/546] scsi: sg: remove 'save_scat_len' commit 136e57bf43dc4babbfb8783abbf707d483cacbe3 upstream. Unused. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 71325972e503..0e3a694f08fa 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -157,7 +157,6 @@ typedef struct sg_fd { /* holds the state of a file descriptor */ int timeout; /* defaults to SG_DEFAULT_TIMEOUT */ int timeout_user; /* defaults to SG_DEFAULT_TIMEOUT_USER */ Sg_scatter_hold reserve; /* buffer held for this file descriptor */ - unsigned save_scat_len; /* original length of trunc. scat. element */ Sg_request *headrp; /* head of request slist, NULL->empty */ struct fasync_struct *async_qp; /* used by asynchronous notification */ Sg_request req_arr[SG_MAX_QUEUE]; /* used as singly-linked list */ @@ -2059,7 +2058,6 @@ sg_unlink_reserve(Sg_fd * sfp, Sg_request * srp) req_schp->pages = NULL; req_schp->page_order = 0; req_schp->sglist_len = 0; - sfp->save_scat_len = 0; srp->res_used = 0; /* Called without mutex lock to avoid deadlock */ sfp->res_in_use = 0; From 3682e0c61ffb4fccd1a86bd2af3cbdd23723b9ed Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 7 Apr 2017 09:34:16 +0200 Subject: [PATCH 232/546] scsi: sg: use standard lists for sg_requests commit 109bade9c625c89bb5ea753aaa1a0a97e6fbb548 upstream. 'Sg_request' is using a private list implementation; convert it to standard lists. Signed-off-by: Hannes Reinecke Reviewed-by: Johannes Thumshirn Tested-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 147 +++++++++++++++++++--------------------------- 1 file changed, 61 insertions(+), 86 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 0e3a694f08fa..ff84f4094fca 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -133,7 +133,7 @@ struct sg_device; /* forward declarations */ struct sg_fd; typedef struct sg_request { /* SG_MAX_QUEUE requests outstanding per file */ - struct sg_request *nextrp; /* NULL -> tail request (slist) */ + struct list_head entry; /* list entry */ struct sg_fd *parentfp; /* NULL -> not in use */ Sg_scatter_hold data; /* hold buffer, perhaps scatter list */ sg_io_hdr_t header; /* scsi command+info, see */ @@ -157,7 +157,7 @@ typedef struct sg_fd { /* holds the state of a file descriptor */ int timeout; /* defaults to SG_DEFAULT_TIMEOUT */ int timeout_user; /* defaults to SG_DEFAULT_TIMEOUT_USER */ Sg_scatter_hold reserve; /* buffer held for this file descriptor */ - Sg_request *headrp; /* head of request slist, NULL->empty */ + struct list_head rq_list; /* head of request list */ struct fasync_struct *async_qp; /* used by asynchronous notification */ Sg_request req_arr[SG_MAX_QUEUE]; /* used as singly-linked list */ char low_dma; /* as in parent but possibly overridden to 1 */ @@ -950,7 +950,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) if (!access_ok(VERIFY_WRITE, ip, sizeof (int))) return -EFAULT; read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (srp = sfp->headrp; srp; srp = srp->nextrp) { + list_for_each_entry(srp, &sfp->rq_list, entry) { if ((1 == srp->done) && (!srp->sg_io_owned)) { read_unlock_irqrestore(&sfp->rq_list_lock, iflags); @@ -963,7 +963,8 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return 0; case SG_GET_NUM_WAITING: read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (val = 0, srp = sfp->headrp; srp; srp = srp->nextrp) { + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { if ((1 == srp->done) && (!srp->sg_io_owned)) ++val; } @@ -1038,35 +1039,33 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (srp = sfp->headrp, val = 0; val < SG_MAX_QUEUE; - ++val, srp = srp ? srp->nextrp : srp) { + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { + if (val > SG_MAX_QUEUE) + break; memset(&rinfo[val], 0, SZ_SG_REQ_INFO); - if (srp) { - rinfo[val].req_state = srp->done + 1; - rinfo[val].problem = - srp->header.masked_status & - srp->header.host_status & - srp->header.driver_status; - if (srp->done) - rinfo[val].duration = - srp->header.duration; - else { - ms = jiffies_to_msecs(jiffies); - rinfo[val].duration = - (ms > srp->header.duration) ? - (ms - srp->header.duration) : 0; - } - rinfo[val].orphan = srp->orphan; - rinfo[val].sg_io_owned = - srp->sg_io_owned; - rinfo[val].pack_id = - srp->header.pack_id; - rinfo[val].usr_ptr = - srp->header.usr_ptr; + rinfo[val].req_state = srp->done + 1; + rinfo[val].problem = + srp->header.masked_status & + srp->header.host_status & + srp->header.driver_status; + if (srp->done) + rinfo[val].duration = + srp->header.duration; + else { + ms = jiffies_to_msecs(jiffies); + rinfo[val].duration = + (ms > srp->header.duration) ? + (ms - srp->header.duration) : 0; } + rinfo[val].orphan = srp->orphan; + rinfo[val].sg_io_owned = srp->sg_io_owned; + rinfo[val].pack_id = srp->header.pack_id; + rinfo[val].usr_ptr = srp->header.usr_ptr; + val++; } read_unlock_irqrestore(&sfp->rq_list_lock, iflags); - result = __copy_to_user(p, rinfo, + result = __copy_to_user(p, rinfo, SZ_SG_REQ_INFO * SG_MAX_QUEUE); result = result ? -EFAULT : 0; kfree(rinfo); @@ -1172,7 +1171,7 @@ sg_poll(struct file *filp, poll_table * wait) return POLLERR; poll_wait(filp, &sfp->read_wait, wait); read_lock_irqsave(&sfp->rq_list_lock, iflags); - for (srp = sfp->headrp; srp; srp = srp->nextrp) { + list_for_each_entry(srp, &sfp->rq_list, entry) { /* if any read waiting, flag it */ if ((0 == res) && (1 == srp->done) && (!srp->sg_io_owned)) res = POLLIN | POLLRDNORM; @@ -2070,7 +2069,7 @@ sg_get_rq_mark(Sg_fd * sfp, int pack_id) unsigned long iflags; write_lock_irqsave(&sfp->rq_list_lock, iflags); - for (resp = sfp->headrp; resp; resp = resp->nextrp) { + list_for_each_entry(resp, &sfp->rq_list, entry) { /* look for requests that are ready + not SG_IO owned */ if ((1 == resp->done) && (!resp->sg_io_owned) && ((-1 == pack_id) || (resp->header.pack_id == pack_id))) { @@ -2088,70 +2087,45 @@ sg_add_request(Sg_fd * sfp) { int k; unsigned long iflags; - Sg_request *resp; Sg_request *rp = sfp->req_arr; write_lock_irqsave(&sfp->rq_list_lock, iflags); - resp = sfp->headrp; - if (!resp) { - memset(rp, 0, sizeof (Sg_request)); - rp->parentfp = sfp; - resp = rp; - sfp->headrp = resp; - } else { - if (0 == sfp->cmd_q) - resp = NULL; /* command queuing disallowed */ - else { - for (k = 0; k < SG_MAX_QUEUE; ++k, ++rp) { - if (!rp->parentfp) - break; - } - if (k < SG_MAX_QUEUE) { - memset(rp, 0, sizeof (Sg_request)); - rp->parentfp = sfp; - while (resp->nextrp) - resp = resp->nextrp; - resp->nextrp = rp; - resp = rp; - } else - resp = NULL; + if (!list_empty(&sfp->rq_list)) { + if (!sfp->cmd_q) + goto out_unlock; + + for (k = 0; k < SG_MAX_QUEUE; ++k, ++rp) { + if (!rp->parentfp) + break; } + if (k >= SG_MAX_QUEUE) + goto out_unlock; } - if (resp) { - resp->nextrp = NULL; - resp->header.duration = jiffies_to_msecs(jiffies); - } + memset(rp, 0, sizeof (Sg_request)); + rp->parentfp = sfp; + rp->header.duration = jiffies_to_msecs(jiffies); + list_add_tail(&rp->entry, &sfp->rq_list); write_unlock_irqrestore(&sfp->rq_list_lock, iflags); - return resp; + return rp; +out_unlock: + write_unlock_irqrestore(&sfp->rq_list_lock, iflags); + return NULL; } /* Return of 1 for found; 0 for not found */ static int sg_remove_request(Sg_fd * sfp, Sg_request * srp) { - Sg_request *prev_rp; - Sg_request *rp; unsigned long iflags; int res = 0; - if ((!sfp) || (!srp) || (!sfp->headrp)) + if (!sfp || !srp || list_empty(&sfp->rq_list)) return res; write_lock_irqsave(&sfp->rq_list_lock, iflags); - prev_rp = sfp->headrp; - if (srp == prev_rp) { - sfp->headrp = prev_rp->nextrp; - prev_rp->parentfp = NULL; + if (!list_empty(&srp->entry)) { + list_del(&srp->entry); + srp->parentfp = NULL; res = 1; - } else { - while ((rp = prev_rp->nextrp)) { - if (srp == rp) { - prev_rp->nextrp = rp->nextrp; - rp->parentfp = NULL; - res = 1; - break; - } - prev_rp = rp; - } } write_unlock_irqrestore(&sfp->rq_list_lock, iflags); return res; @@ -2170,7 +2144,7 @@ sg_add_sfp(Sg_device * sdp) init_waitqueue_head(&sfp->read_wait); rwlock_init(&sfp->rq_list_lock); - + INIT_LIST_HEAD(&sfp->rq_list); kref_init(&sfp->f_ref); mutex_init(&sfp->f_mutex); sfp->timeout = SG_DEFAULT_TIMEOUT; @@ -2211,10 +2185,13 @@ sg_remove_sfp_usercontext(struct work_struct *work) { struct sg_fd *sfp = container_of(work, struct sg_fd, ew.work); struct sg_device *sdp = sfp->parentdp; + Sg_request *srp; /* Cleanup any responses which were never read(). */ - while (sfp->headrp) - sg_finish_rem_req(sfp->headrp); + while (!list_empty(&sfp->rq_list)) { + srp = list_first_entry(&sfp->rq_list, Sg_request, entry); + sg_finish_rem_req(srp); + } if (sfp->reserve.bufflen > 0) { SCSI_LOG_TIMEOUT(6, sg_printk(KERN_INFO, sdp, @@ -2617,7 +2594,7 @@ static int sg_proc_seq_show_devstrs(struct seq_file *s, void *v) /* must be called while holding sg_index_lock */ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) { - int k, m, new_interface, blen, usg; + int k, new_interface, blen, usg; Sg_request *srp; Sg_fd *fp; const sg_io_hdr_t *hp; @@ -2637,13 +2614,11 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) seq_printf(s, " cmd_q=%d f_packid=%d k_orphan=%d closed=0\n", (int) fp->cmd_q, (int) fp->force_packid, (int) fp->keep_orphan); - for (m = 0, srp = fp->headrp; - srp != NULL; - ++m, srp = srp->nextrp) { + list_for_each_entry(srp, &fp->rq_list, entry) { hp = &srp->header; new_interface = (hp->interface_id == '\0') ? 0 : 1; if (srp->res_used) { - if (new_interface && + if (new_interface && (SG_FLAG_MMAP_IO & hp->flags)) cp = " mmap>> "; else @@ -2674,7 +2649,7 @@ static void sg_proc_debug_helper(struct seq_file *s, Sg_device * sdp) seq_printf(s, "ms sgat=%d op=0x%02x\n", usg, (int) srp->data.cmd_opcode); } - if (0 == m) + if (list_empty(&fp->rq_list)) seq_puts(s, " No requests active\n"); read_unlock(&fp->rq_list_lock); } From f0cd701d475038d3078867cad576f6530c065120 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 17 Aug 2017 10:09:54 +0300 Subject: [PATCH 233/546] scsi: sg: off by one in sg_ioctl() commit bd46fc406b30d1db1aff8dabaff8d18bb423fdcf upstream. If "val" is SG_MAX_QUEUE then we are one element beyond the end of the "rinfo" array so the > should be >=. Fixes: 109bade9c625 ("scsi: sg: use standard lists for sg_requests") Signed-off-by: Dan Carpenter Acked-by: Douglas Gilbert Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index ff84f4094fca..f758f80f1c77 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -1041,7 +1041,7 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) read_lock_irqsave(&sfp->rq_list_lock, iflags); val = 0; list_for_each_entry(srp, &sfp->rq_list, entry) { - if (val > SG_MAX_QUEUE) + if (val >= SG_MAX_QUEUE) break; memset(&rinfo[val], 0, SZ_SG_REQ_INFO); rinfo[val].req_state = srp->done + 1; From c04996ad58eefbef5d3aafd340ce64aa54661425 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:15 +0200 Subject: [PATCH 234/546] scsi: sg: factor out sg_fill_request_table() commit 4759df905a474d245752c9dc94288e779b8734dd upstream. Factor out sg_fill_request_table() for better readability. [mkp: typos, applied by hand] Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 61 +++++++++++++++++++++++++++-------------------- 1 file changed, 35 insertions(+), 26 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index f758f80f1c77..525937f9b536 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -839,6 +839,40 @@ static int max_sectors_bytes(struct request_queue *q) return max_sectors << 9; } +static void +sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) +{ + Sg_request *srp; + int val; + unsigned int ms; + + val = 0; + list_for_each_entry(srp, &sfp->rq_list, entry) { + if (val > SG_MAX_QUEUE) + break; + memset(&rinfo[val], 0, SZ_SG_REQ_INFO); + rinfo[val].req_state = srp->done + 1; + rinfo[val].problem = + srp->header.masked_status & + srp->header.host_status & + srp->header.driver_status; + if (srp->done) + rinfo[val].duration = + srp->header.duration; + else { + ms = jiffies_to_msecs(jiffies); + rinfo[val].duration = + (ms > srp->header.duration) ? + (ms - srp->header.duration) : 0; + } + rinfo[val].orphan = srp->orphan; + rinfo[val].sg_io_owned = srp->sg_io_owned; + rinfo[val].pack_id = srp->header.pack_id; + rinfo[val].usr_ptr = srp->header.usr_ptr; + val++; + } +} + static long sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) { @@ -1032,38 +1066,13 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) return -EFAULT; else { sg_req_info_t *rinfo; - unsigned int ms; rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); - val = 0; - list_for_each_entry(srp, &sfp->rq_list, entry) { - if (val >= SG_MAX_QUEUE) - break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); - rinfo[val].req_state = srp->done + 1; - rinfo[val].problem = - srp->header.masked_status & - srp->header.host_status & - srp->header.driver_status; - if (srp->done) - rinfo[val].duration = - srp->header.duration; - else { - ms = jiffies_to_msecs(jiffies); - rinfo[val].duration = - (ms > srp->header.duration) ? - (ms - srp->header.duration) : 0; - } - rinfo[val].orphan = srp->orphan; - rinfo[val].sg_io_owned = srp->sg_io_owned; - rinfo[val].pack_id = srp->header.pack_id; - rinfo[val].usr_ptr = srp->header.usr_ptr; - val++; - } + sg_fill_request_table(sfp, rinfo); read_unlock_irqrestore(&sfp->rq_list_lock, iflags); result = __copy_to_user(p, rinfo, SZ_SG_REQ_INFO * SG_MAX_QUEUE); From 72896ca30a7f6ceb5238714d5761e4ad4521ccc5 Mon Sep 17 00:00:00 2001 From: Hannes Reinecke Date: Fri, 15 Sep 2017 14:05:16 +0200 Subject: [PATCH 235/546] scsi: sg: fixup infoleak when using SG_GET_REQUEST_TABLE commit 3e0097499839e0fe3af380410eababe5a47c4cf9 upstream. When calling SG_GET_REQUEST_TABLE ioctl only a half-filled table is returned; the remaining part will then contain stale kernel memory information. This patch zeroes out the entire table to avoid this issue. Signed-off-by: Hannes Reinecke Reviewed-by: Bart Van Assche Reviewed-by: Christoph Hellwig Reviewed-by: Eric Dumazet Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sg.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 525937f9b536..39e8b5dc23fa 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -850,7 +850,6 @@ sg_fill_request_table(Sg_fd *sfp, sg_req_info_t *rinfo) list_for_each_entry(srp, &sfp->rq_list, entry) { if (val > SG_MAX_QUEUE) break; - memset(&rinfo[val], 0, SZ_SG_REQ_INFO); rinfo[val].req_state = srp->done + 1; rinfo[val].problem = srp->header.masked_status & @@ -1067,8 +1066,8 @@ sg_ioctl(struct file *filp, unsigned int cmd_in, unsigned long arg) else { sg_req_info_t *rinfo; - rinfo = kmalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, - GFP_KERNEL); + rinfo = kzalloc(SZ_SG_REQ_INFO * SG_MAX_QUEUE, + GFP_KERNEL); if (!rinfo) return -ENOMEM; read_lock_irqsave(&sfp->rq_list_lock, iflags); From d8663aa2778965c75b5e75c7948b44f5de601a88 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 30 Aug 2017 16:30:35 +0300 Subject: [PATCH 236/546] scsi: qla2xxx: Fix an integer overflow in sysfs code commit e6f77540c067b48dee10f1e33678415bfcc89017 upstream. The value of "size" comes from the user. When we add "start + size" it could lead to an integer overflow bug. It means we vmalloc() a lot more memory than we had intended. I believe that on 64 bit systems vmalloc() can succeed even if we ask it to allocate huge 4GB buffers. So we would get memory corruption and likely a crash when we call ha->isp_ops->write_optrom() and ->read_optrom(). Only root can trigger this bug. Link: https://bugzilla.kernel.org/show_bug.cgi?id=194061 Fixes: b7cc176c9eb3 ("[SCSI] qla2xxx: Allow region-based flash-part accesses.") Reported-by: shqking Signed-off-by: Dan Carpenter Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/qla2xxx/qla_attr.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/scsi/qla2xxx/qla_attr.c b/drivers/scsi/qla2xxx/qla_attr.c index 1ed85dfc008d..ac12ee844bfc 100644 --- a/drivers/scsi/qla2xxx/qla_attr.c +++ b/drivers/scsi/qla2xxx/qla_attr.c @@ -404,6 +404,8 @@ qla2x00_sysfs_write_optrom_ctl(struct file *filp, struct kobject *kobj, return -EINVAL; if (start > ha->optrom_size) return -EINVAL; + if (size > ha->optrom_size - start) + size = ha->optrom_size - start; mutex_lock(&ha->optrom_mutex); switch (val) { @@ -429,8 +431,7 @@ qla2x00_sysfs_write_optrom_ctl(struct file *filp, struct kobject *kobj, } ha->optrom_region_start = start; - ha->optrom_region_size = start + size > ha->optrom_size ? - ha->optrom_size - start : size; + ha->optrom_region_size = start + size; ha->optrom_state = QLA_SREADING; ha->optrom_buffer = vmalloc(ha->optrom_region_size); @@ -503,8 +504,7 @@ qla2x00_sysfs_write_optrom_ctl(struct file *filp, struct kobject *kobj, } ha->optrom_region_start = start; - ha->optrom_region_size = start + size > ha->optrom_size ? - ha->optrom_size - start : size; + ha->optrom_region_size = start + size; ha->optrom_state = QLA_SWRITING; ha->optrom_buffer = vmalloc(ha->optrom_region_size); From 753154fcfefe0d8be9c68096e7709326b9ede349 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 12:04:09 -0400 Subject: [PATCH 237/546] ftrace: Fix selftest goto location on error commit 46320a6acc4fb58f04bcf78c4c942cc43b20f986 upstream. In the second iteration of trace_selftest_ops(), the error goto label is wrong in the case where trace_selftest_test_global_cnt is off. In the case of error, it leaks the dynamic ops that was allocated. Fixes: 95950c2e ("ftrace: Add self-tests for multiple function trace users") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace_selftest.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c index b0f86ea77881..ca70d11b8aa7 100644 --- a/kernel/trace/trace_selftest.c +++ b/kernel/trace/trace_selftest.c @@ -272,7 +272,7 @@ static int trace_selftest_ops(struct trace_array *tr, int cnt) goto out_free; if (cnt > 1) { if (trace_selftest_test_global_cnt == 0) - goto out; + goto out_free; } if (trace_selftest_test_dyn_cnt == 0) goto out_free; From d28e96be7c6a2a4310c83c13054475836f6ffbae Mon Sep 17 00:00:00 2001 From: Baohong Liu Date: Tue, 5 Sep 2017 16:57:19 -0500 Subject: [PATCH 238/546] tracing: Apply trace_clock changes to instance max buffer commit 170b3b1050e28d1ba0700e262f0899ffa4fccc52 upstream. Currently trace_clock timestamps are applied to both regular and max buffers only for global trace. For instance trace, trace_clock timestamps are applied only to regular buffer. But, regular and max buffers can be swapped, for example, following a snapshot. So, for instance trace, bad timestamps can be seen following a snapshot. Let's apply trace_clock timestamps to instance max buffer as well. Link: http://lkml.kernel.org/r/ebdb168d0be042dcdf51f81e696b17fabe3609c1.1504642143.git.tom.zanussi@linux.intel.com Fixes: 277ba0446 ("tracing: Add interface to allow multiple trace buffers") Signed-off-by: Baohong Liu Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index d59ebd9d21df..4743066010c4 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -5237,7 +5237,7 @@ static int tracing_set_clock(struct trace_array *tr, const char *clockstr) tracing_reset_online_cpus(&tr->trace_buffer); #ifdef CONFIG_TRACER_MAX_TRACE - if (tr->flags & TRACE_ARRAY_FL_GLOBAL && tr->max_buffer.buffer) + if (tr->max_buffer.buffer) ring_buffer_set_clock(tr->max_buffer.buffer, trace_clocks[i].func); tracing_reset_online_cpus(&tr->max_buffer); #endif From 81306fc3dbb53b11f9c42d31403df3655d50f935 Mon Sep 17 00:00:00 2001 From: Jose Abreu Date: Fri, 1 Sep 2017 17:00:23 +0100 Subject: [PATCH 239/546] ARC: Re-enable MMU upon Machine Check exception commit 1ee55a8f7f6b7ca4c0c59e0b4b4e3584a085c2d3 upstream. I recently came upon a scenario where I would get a double fault machine check exception tiriggered by a kernel module. However the ensuing crash stacktrace (ksym lookup) was not working correctly. Turns out that machine check auto-disables MMU while modules are allocated in kernel vaddr spapce. This patch re-enables the MMU before start printing the stacktrace making stacktracing of modules work upon a fatal exception. Signed-off-by: Jose Abreu Reviewed-by: Alexey Brodkin Signed-off-by: Vineet Gupta [vgupta: moved code into low level handler to avoid in 2 places] Signed-off-by: Greg Kroah-Hartman --- arch/arc/kernel/entry.S | 6 ++++++ arch/arc/mm/tlb.c | 3 --- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/arch/arc/kernel/entry.S b/arch/arc/kernel/entry.S index 2efb0625331d..db1eee5fe502 100644 --- a/arch/arc/kernel/entry.S +++ b/arch/arc/kernel/entry.S @@ -104,6 +104,12 @@ ENTRY(EV_MachineCheck) lr r0, [efa] mov r1, sp + ; hardware auto-disables MMU, re-enable it to allow kernel vaddr + ; access for say stack unwinding of modules for crash dumps + lr r3, [ARC_REG_PID] + or r3, r3, MMU_ENABLE + sr r3, [ARC_REG_PID] + lsr r3, r2, 8 bmsk r3, r3, 7 brne r3, ECR_C_MCHK_DUP_TLB, 1f diff --git a/arch/arc/mm/tlb.c b/arch/arc/mm/tlb.c index daf2bf52b984..97e9582dcf99 100644 --- a/arch/arc/mm/tlb.c +++ b/arch/arc/mm/tlb.c @@ -885,9 +885,6 @@ void do_tlb_overlap_fault(unsigned long cause, unsigned long address, local_irq_save(flags); - /* re-enable the MMU */ - write_aux_reg(ARC_REG_PID, MMU_ENABLE | read_aux_reg(ARC_REG_PID)); - /* loop thru all sets of TLB */ for (set = 0; set < mmu->sets; set++) { From 7498bd6058405ba17df33be06046fefdb419fe00 Mon Sep 17 00:00:00 2001 From: Aleksandr Bezzubikov Date: Tue, 18 Jul 2017 17:12:25 +0300 Subject: [PATCH 240/546] PCI: shpchp: Enable bridge bus mastering if MSI is enabled commit 48b79a14505349a29b3e20f03619ada9b33c4b17 upstream. An SHPC may generate MSIs to notify software about slot or controller events (SHPC spec r1.0, sec 4.7). A PCI device can only generate an MSI if it has bus mastering enabled. Enable bus mastering if the bridge contains an SHPC that uses MSI for event notifications. Signed-off-by: Aleksandr Bezzubikov [bhelgaas: changelog] Signed-off-by: Bjorn Helgaas Reviewed-by: Marcel Apfelbaum Acked-by: Michael S. Tsirkin Signed-off-by: Greg Kroah-Hartman --- drivers/pci/hotplug/shpchp_hpc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/pci/hotplug/shpchp_hpc.c b/drivers/pci/hotplug/shpchp_hpc.c index 7d223e9080ef..77dddee2753a 100644 --- a/drivers/pci/hotplug/shpchp_hpc.c +++ b/drivers/pci/hotplug/shpchp_hpc.c @@ -1062,6 +1062,8 @@ int shpc_init(struct controller *ctrl, struct pci_dev *pdev) if (rc) { ctrl_info(ctrl, "Can't get msi for the hotplug controller\n"); ctrl_info(ctrl, "Use INTx for the hotplug controller\n"); + } else { + pci_set_master(pdev); } rc = request_irq(ctrl->pci_dev->irq, shpc_isr, IRQF_SHARED, From 04affe4e117169e75c4ff1f12dd30d74c9a629fc Mon Sep 17 00:00:00 2001 From: Daniel Mentz Date: Wed, 2 Aug 2017 23:42:17 -0400 Subject: [PATCH 241/546] media: v4l2-compat-ioctl32: Fix timespec conversion commit 9c7ba1d7634cef490b85bc64c4091ff004821bfd upstream. Certain syscalls like recvmmsg support 64 bit timespec values for the X32 ABI. The helper function compat_put_timespec converts a timespec value to a 32 bit or 64 bit value depending on what ABI is used. The v4l2 compat layer, however, is not designed to support 64 bit timespec values and always uses 32 bit values. Hence, compat_put_timespec must not be used. Without this patch, user space will be provided with bad timestamp values from the VIDIOC_DQEVENT ioctl. Also, fields of the struct v4l2_event32 that come immediately after timestamp get overwritten, namely the field named id. Fixes: 81993e81a994 ("compat: Get rid of (get|put)_compat_time(val|spec)") Cc: H. Peter Anvin Cc: Laurent Pinchart Cc: Tiffany Lin Cc: Ricardo Ribalda Delgado Cc: Sakari Ailus Signed-off-by: Daniel Mentz Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/v4l2-core/v4l2-compat-ioctl32.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c index 109f687d1cbd..4379b949bb93 100644 --- a/drivers/media/v4l2-core/v4l2-compat-ioctl32.c +++ b/drivers/media/v4l2-core/v4l2-compat-ioctl32.c @@ -773,7 +773,8 @@ static int put_v4l2_event32(struct v4l2_event *kp, struct v4l2_event32 __user *u copy_to_user(&up->u, &kp->u, sizeof(kp->u)) || put_user(kp->pending, &up->pending) || put_user(kp->sequence, &up->sequence) || - compat_put_timespec(&kp->timestamp, &up->timestamp) || + put_user(kp->timestamp.tv_sec, &up->timestamp.tv_sec) || + put_user(kp->timestamp.tv_nsec, &up->timestamp.tv_nsec) || put_user(kp->id, &up->id) || copy_to_user(up->reserved, kp->reserved, 8 * sizeof(__u32))) return -EFAULT; From 4931578fbeb525e717a7aa96f83f4d85cf48d0b2 Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 8 Aug 2017 08:56:21 -0400 Subject: [PATCH 242/546] media: uvcvideo: Prevent heap overflow when accessing mapped controls commit 7e09f7d5c790278ab98e5f2c22307ebe8ad6e8ba upstream. The size of uvc_control_mapping is user controlled leading to a potential heap overflow in the uvc driver. This adds a check to verify the user provided size fits within the bounds of the defined buffer size. Originally-from: Richard Simmons Signed-off-by: Guenter Roeck Reviewed-by: Laurent Pinchart Signed-off-by: Hans Verkuil Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/usb/uvc/uvc_ctrl.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/media/usb/uvc/uvc_ctrl.c b/drivers/media/usb/uvc/uvc_ctrl.c index 3e59b288b8a8..618e4e2b4207 100644 --- a/drivers/media/usb/uvc/uvc_ctrl.c +++ b/drivers/media/usb/uvc/uvc_ctrl.c @@ -2001,6 +2001,13 @@ int uvc_ctrl_add_mapping(struct uvc_video_chain *chain, goto done; } + /* Validate the user-provided bit-size and offset */ + if (mapping->size > 32 || + mapping->offset + mapping->size > ctrl->info.size * 8) { + ret = -EINVAL; + goto done; + } + list_for_each_entry(map, &ctrl->info.mappings, list) { if (mapping->id == map->id) { uvc_trace(UVC_TRACE_CONTROL, "Can't add mapping '%s', " From 5025da3b532bc5a16ca89670353aa1c89f33be29 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Thu, 7 Sep 2017 01:28:53 +0800 Subject: [PATCH 243/546] bcache: initialize dirty stripes in flash_dev_run() commit 175206cf9ab63161dec74d9cd7f9992e062491f5 upstream. bcache uses a Proportion-Differentiation Controller algorithm to control writeback rate to cached devices. In the PD controller algorithm, dirty stripes of thin flash device should not be counted in, because flash only volumes never write back dirty data. Currently dirty stripe counter for thin flash device is not initialized when the thin flash device starts. Which means the following calculation in PD controller will reference an undefined dirty stripes number, and all cached devices attached to the same cache set where the thin flash device lies on may have an inaccurate writeback rate. This patch calles bch_sectors_dirty_init() in flash_dev_run(), to correctly initialize dirty stripe counter when the thin flash device starts to run. This patch also does following parameter data type change, -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); to call this function conveniently in flash_dev_run(). (Commit log is composed by Coly Li) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/super.c | 3 ++- drivers/md/bcache/writeback.c | 8 ++++---- drivers/md/bcache/writeback.h | 2 +- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 7b5880b8874c..5bf9da324b78 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1023,7 +1023,7 @@ int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c) } if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) { - bch_sectors_dirty_init(dc); + bch_sectors_dirty_init(&dc->disk); atomic_set(&dc->has_dirty, 1); atomic_inc(&dc->count); bch_writeback_queue(dc); @@ -1227,6 +1227,7 @@ static int flash_dev_run(struct cache_set *c, struct uuid_entry *u) goto err; bcache_device_attach(d, c, u - c->uuids); + bch_sectors_dirty_init(d); bch_flash_dev_request_init(d); add_disk(d->disk); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b9346cd9cda1..5f5da2eeec34 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -488,17 +488,17 @@ static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b, return MAP_CONTINUE; } -void bch_sectors_dirty_init(struct cached_dev *dc) +void bch_sectors_dirty_init(struct bcache_device *d) { struct sectors_dirty_init op; bch_btree_op_init(&op.op, -1); - op.inode = dc->disk.id; + op.inode = d->id; - bch_btree_map_keys(&op.op, dc->disk.c, &KEY(op.inode, 0, 0), + bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0), sectors_dirty_init_fn, 0); - dc->disk.sectors_dirty_last = bcache_dev_sectors_dirty(&dc->disk); + d->sectors_dirty_last = bcache_dev_sectors_dirty(d); } void bch_cached_dev_writeback_init(struct cached_dev *dc) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 073a042aed24..357658b64794 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -85,7 +85,7 @@ static inline void bch_writeback_add(struct cached_dev *dc) void bcache_dev_sectors_dirty_add(struct cache_set *, unsigned, uint64_t, int); -void bch_sectors_dirty_init(struct cached_dev *dc); +void bch_sectors_dirty_init(struct bcache_device *); void bch_cached_dev_writeback_init(struct cached_dev *); int bch_cached_dev_writeback_start(struct cached_dev *); From 093457f2bd329a2fdb253a68c4fffebb0e3d7786 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 6 Sep 2017 14:25:51 +0800 Subject: [PATCH 244/546] bcache: Fix leak of bdev reference commit 4b758df21ee7081ab41448d21d60367efaa625b3 upstream. If blkdev_get_by_path() in register_bcache() fails, we try to lookup the block device using lookup_bdev() to detect which situation we are in to properly report error. However we never drop the reference returned to us from lookup_bdev(). Fix that. Signed-off-by: Jan Kara Acked-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/super.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 5bf9da324b78..46f4c7ee5560 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1960,6 +1960,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (!IS_ERR(bdev)) + bdput(bdev); if (attr == &ksysfs_register_quiet) goto out; } From 0471f58e18e60dcdbd89f882dc0fa6370b94cd48 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:53 +0800 Subject: [PATCH 245/546] bcache: do not subtract sectors_to_gc for bypassed IO commit 69daf03adef5f7bc13e0ac86b4b8007df1767aab upstream. Since bypassed IOs use no bucket, so do not subtract sectors_to_gc to trigger gc thread. Signed-off-by: tang.junhui Acked-by: Coly Li Reviewed-by: Eric Wheeler Reviewed-by: Christoph Hellwig Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/request.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/md/bcache/request.c b/drivers/md/bcache/request.c index 2410df1c2a05..6c4c7caea693 100644 --- a/drivers/md/bcache/request.c +++ b/drivers/md/bcache/request.c @@ -196,12 +196,12 @@ static void bch_data_insert_start(struct closure *cl) struct data_insert_op *op = container_of(cl, struct data_insert_op, cl); struct bio *bio = op->bio, *n; - if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) - wake_up_gc(op->c); - if (op->bypass) return bch_data_invalidate(cl); + if (atomic_sub_return(bio_sectors(bio), &op->c->sectors_to_gc) < 0) + wake_up_gc(op->c); + /* * Journal writes are marked REQ_FLUSH; if the original write was a * flush, it'll wait on the journal write. From d9c6a28a6a1cf0e0854c8175386898288f7f3ddf Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:56 +0800 Subject: [PATCH 246/546] bcache: correct cache_dirty_target in __update_writeback_rate() commit a8394090a9129b40f9d90dcb7f4a49d60c727ca6 upstream. __update_write_rate() uses a Proportion-Differentiation Controller algorithm to control writeback rate. A dirty target number is used in this PD controller to control writeback rate. A larger target number will make the writeback rate smaller, on the versus, a smaller target number will make the writeback rate larger. bcache uses the following steps to calculate the target number, 1) cache_sectors = all-buckets-of-cache-set * buckets-size 2) cache_dirty_target = cache_sectors * cached-device-writeback_percent 3) target = cache_dirty_target * (sectors-of-cached-device/sectors-of-all-cached-devices-of-this-cache-set) The calculation at step 1) for cache_sectors is incorrect, which does not consider dirty blocks occupied by flash only volume. A flash only volume can be took as a bcache device without cached device. All data sectors allocated for it are persistent on cache device and marked dirty, they are not touched by bcache writeback and garbage collection code. So data blocks of flash only volume should be ignore when calculating cache_sectors of cache set. Current code does not subtract dirty sectors of flash only volume, which results a larger target number from the above 3 steps. And in sequence the cache device's writeback rate is smaller then a correct value, writeback speed is slower on all cached devices. This patch fixes the incorrect slower writeback rate by subtracting dirty sectors of flash only volumes in __update_writeback_rate(). (Commit log composed by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Reviewed-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/writeback.c | 3 ++- drivers/md/bcache/writeback.h | 19 +++++++++++++++++++ 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 5f5da2eeec34..8513c5e43458 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -21,7 +21,8 @@ static void __update_writeback_rate(struct cached_dev *dc) { struct cache_set *c = dc->disk.c; - uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size; + uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size - + bcache_flash_devs_sectors_dirty(c); uint64_t cache_dirty_target = div_u64(cache_sectors * dc->writeback_percent, 100); diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 357658b64794..daec4fd782ea 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -14,6 +14,25 @@ static inline uint64_t bcache_dev_sectors_dirty(struct bcache_device *d) return ret; } +static inline uint64_t bcache_flash_devs_sectors_dirty(struct cache_set *c) +{ + uint64_t i, ret = 0; + + mutex_lock(&bch_register_lock); + + for (i = 0; i < c->nr_uuids; i++) { + struct bcache_device *d = c->devices[i]; + + if (!d || !UUID_FLASH_ONLY(&c->uuids[i])) + continue; + ret += bcache_dev_sectors_dirty(d); + } + + mutex_unlock(&bch_register_lock); + + return ret; +} + static inline unsigned offset_to_stripe(struct bcache_device *d, uint64_t offset) { From a6c5e7a0cd0184d3cf4fe29b598931329b237569 Mon Sep 17 00:00:00 2001 From: Tony Asleson Date: Wed, 6 Sep 2017 14:25:57 +0800 Subject: [PATCH 247/546] bcache: Correct return value for sysfs attach errors commit 77fa100f27475d08a569b9d51c17722130f089e7 upstream. If you encounter any errors in bch_cached_dev_attach it will return a negative error code. The variable 'v' which stores the result is unsigned, thus user space sees a very large value returned for bytes written which can cause incorrect user space behavior. Utilize 1 signed variable to use throughout the function to preserve error return capability. Signed-off-by: Tony Asleson Acked-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/sysfs.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/sysfs.c b/drivers/md/bcache/sysfs.c index b3ff57d61dde..4fbb5532f24c 100644 --- a/drivers/md/bcache/sysfs.c +++ b/drivers/md/bcache/sysfs.c @@ -191,7 +191,7 @@ STORE(__cached_dev) { struct cached_dev *dc = container_of(kobj, struct cached_dev, disk.kobj); - unsigned v = size; + ssize_t v = size; struct cache_set *c; struct kobj_uevent_env *env; @@ -226,7 +226,7 @@ STORE(__cached_dev) bch_cached_dev_run(dc); if (attr == &sysfs_cache_mode) { - ssize_t v = bch_read_string_list(buf, bch_cache_modes + 1); + v = bch_read_string_list(buf, bch_cache_modes + 1); if (v < 0) return v; From f522051a84e566b5552b3ba6127184d9cba54120 Mon Sep 17 00:00:00 2001 From: Tang Junhui Date: Wed, 6 Sep 2017 14:25:59 +0800 Subject: [PATCH 248/546] bcache: fix for gc and write-back race commit 9baf30972b5568d8b5bc8b3c46a6ec5b58100463 upstream. gc and write-back get raced (see the email "bcache get stucked" I sended before): gc thread write-back thread | |bch_writeback_thread() |bch_gc_thread() | | |==>read_dirty() |==>bch_btree_gc() | |==>btree_root() //get btree root | | //node write locker | |==>bch_btree_gc_root() | | |==>read_dirty_submit() | |==>write_dirty() | |==>continue_at(cl, | | write_dirty_finish, | | system_wq); | |==>write_dirty_finish()//excute | | //in system_wq | |==>bch_btree_insert() | |==>bch_btree_map_leaf_nodes() | |==>__bch_btree_map_nodes() | |==>btree_root //try to get btree | | //root node read | | //lock | |-----stuck here |==>bch_btree_set_root() |==>bch_journal_meta() |==>bch_journal() |==>journal_try_write() |==>journal_write_unlocked() //journal_full(&c->journal) | //condition satisfied |==>continue_at(cl, journal_write, system_wq); //try to excute | //journal_write in system_wq | //but work queue is excuting | //write_dirty_finish() |==>closure_sync(); //wait journal_write execute | //over and wake up gc, |-------------stuck here |==>release root node write locker This patch alloc a separate work-queue for write-back thread to avoid such race. (Commit log re-organized by Coly Li to pass checkpatch.pl checking) Signed-off-by: Tang Junhui Acked-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/bcache.h | 1 + drivers/md/bcache/super.c | 2 ++ drivers/md/bcache/writeback.c | 9 +++++++-- 3 files changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/md/bcache/bcache.h b/drivers/md/bcache/bcache.h index c3ea03c9a1a8..02619cabda8b 100644 --- a/drivers/md/bcache/bcache.h +++ b/drivers/md/bcache/bcache.h @@ -333,6 +333,7 @@ struct cached_dev { /* Limit number of writeback bios in flight */ struct semaphore in_flight; struct task_struct *writeback_thread; + struct workqueue_struct *writeback_write_wq; struct keybuf writeback_keys; diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 46f4c7ee5560..c5ceea9222ff 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -1056,6 +1056,8 @@ static void cached_dev_free(struct closure *cl) cancel_delayed_work_sync(&dc->writeback_rate_update); if (!IS_ERR_OR_NULL(dc->writeback_thread)) kthread_stop(dc->writeback_thread); + if (dc->writeback_write_wq) + destroy_workqueue(dc->writeback_write_wq); mutex_lock(&bch_register_lock); diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index 8513c5e43458..bbb1dc9e1639 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -191,7 +191,7 @@ static void write_dirty(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty_finish, system_wq); + continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq); } static void read_dirty_endio(struct bio *bio) @@ -211,7 +211,7 @@ static void read_dirty_submit(struct closure *cl) closure_bio_submit(&io->bio, cl); - continue_at(cl, write_dirty, system_wq); + continue_at(cl, write_dirty, io->dc->writeback_write_wq); } static void read_dirty(struct cached_dev *dc) @@ -523,6 +523,11 @@ void bch_cached_dev_writeback_init(struct cached_dev *dc) int bch_cached_dev_writeback_start(struct cached_dev *dc) { + dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq", + WQ_MEM_RECLAIM, 0); + if (!dc->writeback_write_wq) + return -ENOMEM; + dc->writeback_thread = kthread_create(bch_writeback_thread, dc, "bcache_writeback"); if (IS_ERR(dc->writeback_thread)) From a069d0a43de42e009ec09e76855a1780fbf20938 Mon Sep 17 00:00:00 2001 From: Michael Lyle Date: Wed, 6 Sep 2017 14:26:02 +0800 Subject: [PATCH 249/546] bcache: fix bch_hprint crash and improve output commit 9276717b9e297a62d1151a43d1cd286213f68eb7 upstream. Most importantly, solve a crash where %llu was used to format signed numbers. This would cause a buffer overflow when reading sysfs writeback_rate_debug, as only 20 bytes were allocated for this and %llu writes 20 characters plus a null. Always use the units mechanism rather than having different output paths for simplicity. Also, correct problems with display output where 1.10 was a larger number than 1.09, by multiplying by 10 and then dividing by 1024 instead of dividing by 100. (Remainders of >= 1000 would print as .10). Minor changes: Always display the decimal point instead of trying to omit it based on number of digits shown. Decide what units to use based on 1000 as a threshold, not 1024 (in other words, always print at most 3 digits before the decimal point). Signed-off-by: Michael Lyle Reported-by: Dmitry Yu Okunev Acked-by: Kent Overstreet Reviewed-by: Coly Li Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/md/bcache/util.c | 42 +++++++++++++++++++++++++++++----------- 1 file changed, 31 insertions(+), 11 deletions(-) diff --git a/drivers/md/bcache/util.c b/drivers/md/bcache/util.c index db3ae4c2b223..6c18e3ec3e48 100644 --- a/drivers/md/bcache/util.c +++ b/drivers/md/bcache/util.c @@ -73,24 +73,44 @@ STRTO_H(strtouint, unsigned int) STRTO_H(strtoll, long long) STRTO_H(strtoull, unsigned long long) +/** + * bch_hprint() - formats @v to human readable string for sysfs. + * + * @v - signed 64 bit integer + * @buf - the (at least 8 byte) buffer to format the result into. + * + * Returns the number of bytes used by format. + */ ssize_t bch_hprint(char *buf, int64_t v) { static const char units[] = "?kMGTPEZY"; - char dec[4] = ""; - int u, t = 0; + int u = 0, t; - for (u = 0; v >= 1024 || v <= -1024; u++) { - t = v & ~(~0 << 10); - v >>= 10; - } + uint64_t q; - if (!u) - return sprintf(buf, "%llu", v); + if (v < 0) + q = -v; + else + q = v; - if (v < 100 && v > -100) - snprintf(dec, sizeof(dec), ".%i", t / 100); + /* For as long as the number is more than 3 digits, but at least + * once, shift right / divide by 1024. Keep the remainder for + * a digit after the decimal point. + */ + do { + u++; - return sprintf(buf, "%lli%s%c", v, dec, units[u]); + t = q & ~(~0 << 10); + q >>= 10; + } while (q >= 1000); + + if (v < 0) + /* '-', up to 3 digits, '.', 1 digit, 1 character, null; + * yields 8 bytes. + */ + return sprintf(buf, "-%llu.%i%c", q, t * 10 / 1024, units[u]); + else + return sprintf(buf, "%llu.%i%c", q, t * 10 / 1024, units[u]); } ssize_t bch_snprint_string_list(char *buf, size_t size, const char * const list[], From ed1bf4397d2219d4b9ec2d5517416ba102186650 Mon Sep 17 00:00:00 2001 From: "Steven Rostedt (VMware)" Date: Fri, 1 Sep 2017 12:18:28 -0400 Subject: [PATCH 250/546] ftrace: Fix memleak when unregistering dynamic ops when tracing disabled commit edb096e00724f02db5f6ec7900f3bbd465c6c76f upstream. If function tracing is disabled by the user via the function-trace option or the proc sysctl file, and a ftrace_ops that was allocated on the heap is unregistered, then the shutdown code exits out without doing the proper clean up. This was found via kmemleak and running the ftrace selftests, as one of the tests unregisters with function tracing disabled. # cat kmemleak unreferenced object 0xffffffffa0020000 (size 4096): comm "swapper/0", pid 1, jiffies 4294668889 (age 569.209s) hex dump (first 32 bytes): 55 ff 74 24 10 55 48 89 e5 ff 74 24 18 55 48 89 U.t$.UH...t$.UH. e5 48 81 ec a8 00 00 00 48 89 44 24 50 48 89 4c .H......H.D$PH.L backtrace: [] kmemleak_vmalloc+0x85/0xf0 [] __vmalloc_node_range+0x281/0x3e0 [] module_alloc+0x4f/0x90 [] arch_ftrace_update_trampoline+0x160/0x420 [] ftrace_startup+0xe7/0x300 [] register_ftrace_function+0x72/0x90 [] trace_selftest_ops+0x204/0x397 [] trace_selftest_startup_function+0x394/0x624 [] run_tracer_selftest+0x15c/0x1d7 [] init_trace_selftests+0x75/0x192 [] do_one_initcall+0x90/0x1e2 [] kernel_init_freeable+0x350/0x3fe [] kernel_init+0x13/0x122 [] ret_from_fork+0x2a/0x40 [] 0xffffffffffffffff Fixes: 12cce594fa ("ftrace/x86: Allow !CONFIG_PREEMPT dynamic ops to use allocated trampolines") Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index eba904bae48c..38d73a6e2857 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -2667,13 +2667,14 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (!command || !ftrace_enabled) { /* - * If these are control ops, they still need their - * per_cpu field freed. Since, function tracing is + * If these are dynamic or control ops, they still + * need their data freed. Since, function tracing is * not currently active, we can just free them * without synchronizing all CPUs. */ - if (ops->flags & FTRACE_OPS_FL_CONTROL) - control_ops_free(ops); + if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) + goto free_ops; + return 0; } @@ -2728,6 +2729,7 @@ static int ftrace_shutdown(struct ftrace_ops *ops, int command) if (ops->flags & (FTRACE_OPS_FL_DYNAMIC | FTRACE_OPS_FL_CONTROL)) { schedule_on_each_cpu(ftrace_sync); + free_ops: arch_ftrace_trampoline_free(ops); if (ops->flags & FTRACE_OPS_FL_CONTROL) From 10def3a6779924f7bb130200b8b940caf2914111 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 27 Sep 2017 11:00:37 +0200 Subject: [PATCH 251/546] Linux 4.4.89 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 788d90a0051b..7e4c46b375b3 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 88 +SUBLEVEL = 89 EXTRAVERSION = NAME = Blurry Fish Butt From fcc949a48842c4b05bea7057289fe7e7206a03fc Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Fri, 8 Sep 2017 18:48:33 +0800 Subject: [PATCH 252/546] cifs: release auth_key.response for reconnect. commit f5c4ba816315d3b813af16f5571f86c8d4e897bd upstream. There is a race that cause cifs reconnect in cifs_mount, - cifs_mount - cifs_get_tcp_session - [ start thread cifs_demultiplex_thread - cifs_read_from_socket: -ECONNABORTED - DELAY_WORK smb2_reconnect_server ] - cifs_setup_session - [ smb2_reconnect_server ] auth_key.response was allocated in cifs_setup_session, and will release when the session destoried. So when session re- connect, auth_key.response should be check and released. Tested with my system: CIFS VFS: Free previous auth_key.response = ffff8800320bbf80 A simple auth_key.response allocation call trace: - cifs_setup_session - SMB2_sess_setup - SMB2_sess_auth_rawntlmssp_authenticate - build_ntlmssp_auth_blob - setup_ntlmv2_rsp Signed-off-by: Shu Wang Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/connect.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 53a827c6d8b1..b377aa8f266f 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -4060,6 +4060,14 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); + if (ses->auth_key.response) { + cifs_dbg(VFS, "Free previous auth_key.response = %p\n", + ses->auth_key.response); + kfree(ses->auth_key.response); + ses->auth_key.response = NULL; + ses->auth_key.len = 0; + } + if (server->ops->sess_setup) rc = server->ops->sess_setup(xid, ses, nls_info); From 7d8fbf3db1692fee3e9be29de3cc71fe6f086112 Mon Sep 17 00:00:00 2001 From: Avraham Stern Date: Fri, 18 Aug 2017 15:33:57 +0300 Subject: [PATCH 253/546] mac80211: flush hw_roc_start work before cancelling the ROC commit 6e46d8ce894374fc135c96a8d1057c6af1fef237 upstream. When HW ROC is supported it is possible that after the HW notified that the ROC has started, the ROC was cancelled and another ROC was added while the hw_roc_start worker is waiting on the mutex (since cancelling the ROC and adding another one also holds the same mutex). As a result, the hw_roc_start worker will continue to run after the new ROC is added but before it is actually started by the HW. This may result in notifying userspace that the ROC has started before it actually does, or in case of management tx ROC, in an attempt to tx while not on the right channel. In addition, when the driver will notify mac80211 that the second ROC has started, mac80211 will warn that this ROC has already been notified. Fix this by flushing the hw_roc_start work before cancelling an ROC. Signed-off-by: Avraham Stern Signed-off-by: Luca Coelho Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/mac80211/offchannel.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c index 04401037140e..b6be51940ead 100644 --- a/net/mac80211/offchannel.c +++ b/net/mac80211/offchannel.c @@ -469,6 +469,8 @@ void ieee80211_roc_purge(struct ieee80211_local *local, struct ieee80211_roc_work *roc, *tmp; LIST_HEAD(tmp_list); + flush_work(&local->hw_roc_start); + mutex_lock(&local->mtx); list_for_each_entry_safe(roc, tmp, &local->roc_list, list) { if (sdata && roc->sdata != sdata) From f75c0042f120179aedf005de1da461296cda0308 Mon Sep 17 00:00:00 2001 From: Paul Mackerras Date: Tue, 12 Sep 2017 15:54:14 +1000 Subject: [PATCH 254/546] KVM: PPC: Book3S: Fix race and leak in kvm_vm_ioctl_create_spapr_tce() commit 47c5310a8dbe7c2cb9f0083daa43ceed76c257fa upstream, with part of commit edd03602d97236e8fea13cd76886c576186aa307 folded in. Nixiaoming pointed out that there is a memory leak in kvm_vm_ioctl_create_spapr_tce() if the call to anon_inode_getfd() fails; the memory allocated for the kvmppc_spapr_tce_table struct is not freed, and nor are the pages allocated for the iommu tables. David Hildenbrand pointed out that there is a race in that the function checks early on that there is not already an entry in the stt->iommu_tables list with the same LIOBN, but an entry with the same LIOBN could get added between then and when the new entry is added to the list. This fixes both problems. To simplify things, we now call anon_inode_getfd() before placing the new entry in the list. The check for an existing entry is done while holding the kvm->lock mutex, immediately before adding the new entry to the list. [paulus@ozlabs.org - folded in that part of edd03602d972 ("KVM: PPC: Book3S HV: Protect updates to spapr_tce_tables list", 2017-08-28) which restructured the code that 47c5310a8dbe modified, to avoid a build failure caused by the absence of put_unused_fd(). Also removed the locked memory accounting, since it doesn't exist in this version, and adjusted the commit message.] Fixes: 54738c097163 ("KVM: PPC: Accelerate H_PUT_TCE by implementing it in real mode") Reported-by: Nixiaoming Reported-by: David Hildenbrand Signed-off-by: Paul Mackerras Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/kvm/book3s_64_vio.c | 46 +++++++++++++++++++------------- 1 file changed, 27 insertions(+), 19 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc94dad..3a095670b0c4 100644 --- a/arch/powerpc/kvm/book3s_64_vio.c +++ b/arch/powerpc/kvm/book3s_64_vio.c @@ -101,22 +101,17 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce *args) { struct kvmppc_spapr_tce_table *stt = NULL; + struct kvmppc_spapr_tce_table *siter; long npages; int ret = -ENOMEM; int i; - /* Check this LIOBN hasn't been previously allocated */ - list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) { - if (stt->liobn == args->liobn) - return -EBUSY; - } - npages = kvmppc_stt_npages(args->window_size); stt = kzalloc(sizeof(*stt) + npages * sizeof(struct page *), GFP_KERNEL); if (!stt) - goto fail; + return ret; stt->liobn = args->liobn; stt->window_size = args->window_size; @@ -128,23 +123,36 @@ long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, goto fail; } - kvm_get_kvm(kvm); - mutex_lock(&kvm->lock); - list_add(&stt->list, &kvm->arch.spapr_tce_tables); + + /* Check this LIOBN hasn't been previously allocated */ + ret = 0; + list_for_each_entry(siter, &kvm->arch.spapr_tce_tables, list) { + if (siter->liobn == args->liobn) { + ret = -EBUSY; + break; + } + } + + if (!ret) + ret = anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, + stt, O_RDWR | O_CLOEXEC); + + if (ret >= 0) { + list_add(&stt->list, &kvm->arch.spapr_tce_tables); + kvm_get_kvm(kvm); + } mutex_unlock(&kvm->lock); - return anon_inode_getfd("kvm-spapr-tce", &kvm_spapr_tce_fops, - stt, O_RDWR | O_CLOEXEC); + if (ret >= 0) + return ret; -fail: - if (stt) { - for (i = 0; i < npages; i++) - if (stt->pages[i]) - __free_page(stt->pages[i]); + fail: + for (i = 0; i < npages; i++) + if (stt->pages[i]) + __free_page(stt->pages[i]); - kfree(stt); - } + kfree(stt); return ret; } From 9c5afa726a526b2fb07cf0df922f2a959b3815bf Mon Sep 17 00:00:00 2001 From: Tahsin Erdogan Date: Sun, 17 Sep 2017 03:23:48 -0700 Subject: [PATCH 255/546] tracing: Fix trace_pipe behavior for instance traces commit 75df6e688ccd517e339a7c422ef7ad73045b18a2 upstream. When reading data from trace_pipe, tracing_wait_pipe() performs a check to see if tracing has been turned off after some data was read. Currently, this check always looks at global trace state, but it should be checking the trace instance where trace_pipe is located at. Because of this bug, cat instances/i1/trace_pipe in the following script will immediately exit instead of waiting for data: cd /sys/kernel/debug/tracing echo 0 > tracing_on mkdir -p instances/i1 echo 1 > instances/i1/tracing_on echo 1 > instances/i1/events/sched/sched_process_exec/enable cat instances/i1/trace_pipe Link: http://lkml.kernel.org/r/20170917102348.1615-1-tahsin@google.com Fixes: 10246fa35d4f ("tracing: give easy way to clear trace buffer") Signed-off-by: Tahsin Erdogan Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 4743066010c4..6baf340fa172 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -4701,7 +4701,7 @@ static int tracing_wait_pipe(struct file *filp) * * iter->pos will be 0 if we haven't read anything. */ - if (!tracing_is_on() && iter->pos) + if (!tracer_tracing_is_on(iter->tr) && iter->pos) break; mutex_unlock(&iter->mutex); From 68a4a52899187a8411374b861ae1fed78302fab8 Mon Sep 17 00:00:00 2001 From: Bo Yan Date: Mon, 18 Sep 2017 10:03:35 -0700 Subject: [PATCH 256/546] tracing: Erase irqsoff trace with empty write commit 8dd33bcb7050dd6f8c1432732f930932c9d3a33e upstream. One convenient way to erase trace is "echo > trace". However, this is currently broken if the current tracer is irqsoff tracer. This is because irqsoff tracer use max_buffer as the default trace buffer. Set the max_buffer as the one to be cleared when it's the trace buffer currently in use. Link: http://lkml.kernel.org/r/1505754215-29411-1-git-send-email-byan@nvidia.com Cc: Fixes: 4acd4d00f ("tracing: give easy way to clear trace buffer") Signed-off-by: Bo Yan Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/trace.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index 6baf340fa172..b64f35afee4e 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -3226,11 +3226,17 @@ static int tracing_open(struct inode *inode, struct file *file) /* If this file was open for write, then erase contents */ if ((file->f_mode & FMODE_WRITE) && (file->f_flags & O_TRUNC)) { int cpu = tracing_get_cpu(inode); + struct trace_buffer *trace_buf = &tr->trace_buffer; + +#ifdef CONFIG_TRACER_MAX_TRACE + if (tr->current_trace->print_max) + trace_buf = &tr->max_buffer; +#endif if (cpu == RING_BUFFER_ALL_CPUS) - tracing_reset_online_cpus(&tr->trace_buffer); + tracing_reset_online_cpus(trace_buf); else - tracing_reset(&tr->trace_buffer, cpu); + tracing_reset(trace_buf, cpu); } if (file->f_mode & FMODE_READ) { From d03d1567866e8015db3b7cc706c3659deba500de Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Fri, 25 Aug 2017 10:40:02 -0700 Subject: [PATCH 257/546] md/raid5: fix a race condition in stripe batch commit 3664847d95e60a9a943858b7800f8484669740fc upstream. We have a race condition in below scenario, say have 3 continuous stripes, sh1, sh2 and sh3, sh1 is the stripe_head of sh2 and sh3: CPU1 CPU2 CPU3 handle_stripe(sh3) stripe_add_to_batch_list(sh3) -> lock(sh2, sh3) -> lock batch_lock(sh1) -> add sh3 to batch_list of sh1 -> unlock batch_lock(sh1) clear_batch_ready(sh1) -> lock(sh1) and batch_lock(sh1) -> clear STRIPE_BATCH_READY for all stripes in batch_list -> unlock(sh1) and batch_lock(sh1) ->clear_batch_ready(sh3) -->test_and_clear_bit(STRIPE_BATCH_READY, sh3) --->return 0 as sh->batch == NULL -> sh3->batch_head = sh1 -> unlock (sh2, sh3) In CPU1, handle_stripe will continue handle sh3 even it's in batch stripe list of sh1. By moving sh3->batch_head assignment in to batch_lock, we make it impossible to clear STRIPE_BATCH_READY before batch_head is set. Thanks Stephane for helping debug this tricky issue. Reported-and-tested-by: Stephane Thiell Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index 5eac08ffc697..dfd07cc1d167 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -818,6 +818,14 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh spin_unlock(&head->batch_head->batch_lock); goto unlock_out; } + /* + * We must assign batch_head of this stripe within the + * batch_lock, otherwise clear_batch_ready of batch head + * stripe could clear BATCH_READY bit of this stripe and + * this stripe->batch_head doesn't get assigned, which + * could confuse clear_batch_ready for this stripe + */ + sh->batch_head = head->batch_head; /* * at this point, head's BATCH_READY could be cleared, but we @@ -825,8 +833,6 @@ static void stripe_add_to_batch_list(struct r5conf *conf, struct stripe_head *sh */ list_add(&sh->batch_list, &head->batch_list); spin_unlock(&head->batch_head->batch_lock); - - sh->batch_head = head->batch_head; } else { head->batch_head = head; sh->batch_head = head->batch_head; From 29854a77f7936feb85194c65d5d873a86f9b01de Mon Sep 17 00:00:00 2001 From: Dennis Yang Date: Wed, 6 Sep 2017 11:02:35 +0800 Subject: [PATCH 258/546] md/raid5: preserve STRIPE_ON_UNPLUG_LIST in break_stripe_batch_list commit 184a09eb9a2fe425e49c9538f1604b05ed33cfef upstream. In release_stripe_plug(), if a stripe_head has its STRIPE_ON_UNPLUG_LIST set, it indicates that this stripe_head is already in the raid5_plug_cb list and release_stripe() would be called instead to drop a reference count. Otherwise, the STRIPE_ON_UNPLUG_LIST bit would be set for this stripe_head and it will get queued into the raid5_plug_cb list. Since break_stripe_batch_list() did not preserve STRIPE_ON_UNPLUG_LIST, A stripe could be re-added to plug list while it is still on that list in the following situation. If stripe_head A is added to another stripe_head B's batch list, in this case A will have its batch_head != NULL and be added into the plug list. After that, stripe_head B gets handled and called break_stripe_batch_list() to reset all the batched stripe_head(including A which is still on the plug list)'s state and reset their batch_head to NULL. Before the plug list gets processed, if there is another write request comes in and get stripe_head A, A will have its batch_head == NULL (cleared by calling break_stripe_batch_list() on B) and be added to plug list once again. Signed-off-by: Dennis Yang Signed-off-by: Shaohua Li Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid5.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index dfd07cc1d167..d55bf85b76ce 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -4264,7 +4264,8 @@ static void break_stripe_batch_list(struct stripe_head *head_sh, set_mask_bits(&sh->state, ~(STRIPE_EXPAND_SYNC_FLAGS | (1 << STRIPE_PREREAD_ACTIVE) | - (1 << STRIPE_DEGRADED)), + (1 << STRIPE_DEGRADED) | + (1 << STRIPE_ON_UNPLUG_LIST)), head_sh->state & (1 << STRIPE_INSYNC)); sh->check_state = head_sh->check_state; From 9d2534917c25a58b67864ea1db930670d48dee75 Mon Sep 17 00:00:00 2001 From: Xin Long Date: Sun, 27 Aug 2017 20:25:26 +0800 Subject: [PATCH 259/546] scsi: scsi_transport_iscsi: fix the issue that iscsi_if_rx doesn't parse nlmsg properly commit c88f0e6b06f4092995688211a631bb436125d77b upstream. ChunYu found a kernel crash by syzkaller: [ 651.617875] kasan: CONFIG_KASAN_INLINE enabled [ 651.618217] kasan: GPF could be caused by NULL-ptr deref or user memory access [ 651.618731] general protection fault: 0000 [#1] SMP KASAN [ 651.621543] CPU: 1 PID: 9539 Comm: scsi Not tainted 4.11.0.cov #32 [ 651.621938] Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 [ 651.622309] task: ffff880117780000 task.stack: ffff8800a3188000 [ 651.622762] RIP: 0010:skb_release_data+0x26c/0x590 [...] [ 651.627260] Call Trace: [ 651.629156] skb_release_all+0x4f/0x60 [ 651.629450] consume_skb+0x1a5/0x600 [ 651.630705] netlink_unicast+0x505/0x720 [ 651.632345] netlink_sendmsg+0xab2/0xe70 [ 651.633704] sock_sendmsg+0xcf/0x110 [ 651.633942] ___sys_sendmsg+0x833/0x980 [ 651.637117] __sys_sendmsg+0xf3/0x240 [ 651.638820] SyS_sendmsg+0x32/0x50 [ 651.639048] entry_SYSCALL_64_fastpath+0x1f/0xc2 It's caused by skb_shared_info at the end of sk_buff was overwritten by ISCSI_KEVENT_IF_ERROR when parsing nlmsg info from skb in iscsi_if_rx. During the loop if skb->len == nlh->nlmsg_len and both are sizeof(*nlh), ev = nlmsg_data(nlh) will acutally get skb_shinfo(SKB) instead and set a new value to skb_shinfo(SKB)->nr_frags by ev->type. This patch is to fix it by checking nlh->nlmsg_len properly there to avoid over accessing sk_buff. Reported-by: ChunYu Wang Signed-off-by: Xin Long Acked-by: Chris Leech Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/scsi_transport_iscsi.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/scsi/scsi_transport_iscsi.c b/drivers/scsi/scsi_transport_iscsi.c index e4b3d8f4fd85..bb4ed7b1f5df 100644 --- a/drivers/scsi/scsi_transport_iscsi.c +++ b/drivers/scsi/scsi_transport_iscsi.c @@ -3697,7 +3697,7 @@ iscsi_if_rx(struct sk_buff *skb) uint32_t group; nlh = nlmsg_hdr(skb); - if (nlh->nlmsg_len < sizeof(*nlh) || + if (nlh->nlmsg_len < sizeof(*nlh) + sizeof(*ev) || skb->len < nlh->nlmsg_len) { break; } From 231c4f646b7767c9b5a52d3dcd55df588112208b Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Tue, 12 Sep 2017 11:03:39 +0200 Subject: [PATCH 260/546] crypto: talitos - Don't provide setkey for non hmac hashing algs. commit 56136631573baa537a15e0012055ffe8cfec1a33 upstream. Today, md5sum fails with error -ENOKEY because a setkey function is set for non hmac hashing algs, see strace output below: mmap(NULL, 378880, PROT_READ, MAP_SHARED, 6, 0) = 0x77f50000 accept(3, 0, NULL) = 7 vmsplice(5, [{"bin/\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0"..., 378880}], 1, SPLICE_F_MORE|SPLICE_F_GIFT) = 262144 splice(4, NULL, 7, NULL, 262144, SPLICE_F_MORE) = -1 ENOKEY (Required key not available) write(2, "Generation of hash for file kcap"..., 50) = 50 munmap(0x77f50000, 378880) = 0 This patch ensures that setkey() function is set only for hmac hashing. Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/talitos.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 6a60936b46e0..00772faa5306 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -2770,7 +2770,8 @@ static struct talitos_crypto_alg *talitos_alg_alloc(struct device *dev, t_alg->algt.alg.hash.final = ahash_final; t_alg->algt.alg.hash.finup = ahash_finup; t_alg->algt.alg.hash.digest = ahash_digest; - t_alg->algt.alg.hash.setkey = ahash_setkey; + if (!strncmp(alg->cra_name, "hmac", 4)) + t_alg->algt.alg.hash.setkey = ahash_setkey; t_alg->algt.alg.hash.import = ahash_import; t_alg->algt.alg.hash.export = ahash_export; From 362711d59b0c854431ba7e5a645ee8f65e75b459 Mon Sep 17 00:00:00 2001 From: LEROY Christophe Date: Wed, 13 Sep 2017 12:44:51 +0200 Subject: [PATCH 261/546] crypto: talitos - fix sha224 commit afd62fa26343be6445479e75de9f07092a061459 upstream. Kernel crypto tests report the following error at startup [ 2.752626] alg: hash: Test 4 failed for sha224-talitos [ 2.757907] 00000000: 30 e2 86 e2 e7 8a dd 0d d7 eb 9f d5 83 fe f1 b0 00000010: 2d 5a 6c a5 f9 55 ea fd 0e 72 05 22 This patch fixes it Signed-off-by: Christophe Leroy Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- drivers/crypto/talitos.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/crypto/talitos.c b/drivers/crypto/talitos.c index 00772faa5306..62ce93568e11 100644 --- a/drivers/crypto/talitos.c +++ b/drivers/crypto/talitos.c @@ -1749,9 +1749,9 @@ static int common_nonsnoop_hash(struct talitos_edesc *edesc, req_ctx->swinit = 0; } else { desc->ptr[1] = zero_entry; - /* Indicate next op is not the first. */ - req_ctx->first = 0; } + /* Indicate next op is not the first. */ + req_ctx->first = 0; /* HMAC key */ if (ctx->keylen) From af24e9d8ba1a323cd13c4c962a74d0f2c48abd75 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:36:45 -0700 Subject: [PATCH 262/546] KEYS: fix writing past end of user-supplied buffer in keyring_read() commit e645016abc803dafc75e4b8f6e4118f088900ffb upstream. Userspace can call keyctl_read() on a keyring to get the list of IDs of keys in the keyring. But if the user-supplied buffer is too small, the kernel would write the full list anyway --- which will corrupt whatever userspace memory happened to be past the end of the buffer. Fix it by only filling the space that is available. Fixes: b2a4df200d57 ("KEYS: Expand the capacity of a keyring") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- security/keys/keyring.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/security/keys/keyring.c b/security/keys/keyring.c index f931ccfeefb0..262ed2a6b360 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -416,7 +416,7 @@ static void keyring_describe(const struct key *keyring, struct seq_file *m) } struct keyring_read_iterator_context { - size_t qty; + size_t buflen; size_t count; key_serial_t __user *buffer; }; @@ -428,9 +428,9 @@ static int keyring_read_iterator(const void *object, void *data) int ret; kenter("{%s,%d},,{%zu/%zu}", - key->type->name, key->serial, ctx->count, ctx->qty); + key->type->name, key->serial, ctx->count, ctx->buflen); - if (ctx->count >= ctx->qty) + if (ctx->count >= ctx->buflen) return 1; ret = put_user(key->serial, ctx->buffer); @@ -465,16 +465,12 @@ static long keyring_read(const struct key *keyring, return 0; /* Calculate how much data we could return */ - ctx.qty = nr_keys * sizeof(key_serial_t); - if (!buffer || !buflen) - return ctx.qty; - - if (buflen > ctx.qty) - ctx.qty = buflen; + return nr_keys * sizeof(key_serial_t); /* Copy the IDs of the subscribed keys into the buffer */ ctx.buffer = (key_serial_t __user *)buffer; + ctx.buflen = buflen; ctx.count = 0; ret = assoc_array_iterate(&keyring->keys, keyring_read_iterator, &ctx); if (ret < 0) { From 539255aea88e47932a98ba7656775cbca4f3d27c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:03 -0700 Subject: [PATCH 263/546] KEYS: prevent creating a different user's keyrings commit 237bbd29f7a049d310d907f4b2716a7feef9abf3 upstream. It was possible for an unprivileged user to create the user and user session keyrings for another user. For example: sudo -u '#3000' sh -c 'keyctl add keyring _uid.4000 "" @u keyctl add keyring _uid_ses.4000 "" @u sleep 15' & sleep 1 sudo -u '#4000' keyctl describe @u sudo -u '#4000' keyctl describe @us This is problematic because these "fake" keyrings won't have the right permissions. In particular, the user who created them first will own them and will have full access to them via the possessor permissions, which can be used to compromise the security of a user's keys: -4: alswrv-----v------------ 3000 0 keyring: _uid.4000 -5: alswrv-----v------------ 3000 0 keyring: _uid_ses.4000 Fix it by marking user and user session keyrings with a flag KEY_FLAG_UID_KEYRING. Then, when searching for a user or user session keyring by name, skip all keyrings that don't have the flag set. Fixes: 69664cf16af4 ("keys: don't generate user and user session keyrings unless they're accessed") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- include/linux/key.h | 2 ++ security/keys/internal.h | 2 +- security/keys/key.c | 2 ++ security/keys/keyring.c | 23 ++++++++++++++--------- security/keys/process_keys.c | 8 ++++++-- 5 files changed, 25 insertions(+), 12 deletions(-) diff --git a/include/linux/key.h b/include/linux/key.h index 66f705243985..dcc115e8dd03 100644 --- a/include/linux/key.h +++ b/include/linux/key.h @@ -177,6 +177,7 @@ struct key { #define KEY_FLAG_TRUSTED_ONLY 9 /* set if keyring only accepts links to trusted keys */ #define KEY_FLAG_BUILTIN 10 /* set if key is builtin */ #define KEY_FLAG_ROOT_CAN_INVAL 11 /* set if key can be invalidated by root without permission */ +#define KEY_FLAG_UID_KEYRING 12 /* set if key is a user or user session keyring */ /* the key type and key description string * - the desc is used to match a key against search criteria @@ -218,6 +219,7 @@ extern struct key *key_alloc(struct key_type *type, #define KEY_ALLOC_QUOTA_OVERRUN 0x0001 /* add to quota, permit even if overrun */ #define KEY_ALLOC_NOT_IN_QUOTA 0x0002 /* not in quota */ #define KEY_ALLOC_TRUSTED 0x0004 /* Key should be flagged as trusted */ +#define KEY_ALLOC_UID_KEYRING 0x0010 /* allocating a user or user session keyring */ extern void key_revoke(struct key *key); extern void key_invalidate(struct key *key); diff --git a/security/keys/internal.h b/security/keys/internal.h index 5105c2c2da75..51ffb9cde073 100644 --- a/security/keys/internal.h +++ b/security/keys/internal.h @@ -136,7 +136,7 @@ extern key_ref_t keyring_search_aux(key_ref_t keyring_ref, extern key_ref_t search_my_process_keyrings(struct keyring_search_context *ctx); extern key_ref_t search_process_keyrings(struct keyring_search_context *ctx); -extern struct key *find_keyring_by_name(const char *name, bool skip_perm_check); +extern struct key *find_keyring_by_name(const char *name, bool uid_keyring); extern int install_user_keyrings(void); extern int install_thread_keyring_to_cred(struct cred *); diff --git a/security/keys/key.c b/security/keys/key.c index 09c10b181881..51d23c623424 100644 --- a/security/keys/key.c +++ b/security/keys/key.c @@ -296,6 +296,8 @@ struct key *key_alloc(struct key_type *type, const char *desc, key->flags |= 1 << KEY_FLAG_IN_QUOTA; if (flags & KEY_ALLOC_TRUSTED) key->flags |= 1 << KEY_FLAG_TRUSTED; + if (flags & KEY_ALLOC_UID_KEYRING) + key->flags |= 1 << KEY_FLAG_UID_KEYRING; #ifdef KEY_DEBUGGING key->magic = KEY_DEBUG_MAGIC; diff --git a/security/keys/keyring.c b/security/keys/keyring.c index 262ed2a6b360..0c8dd4fbe130 100644 --- a/security/keys/keyring.c +++ b/security/keys/keyring.c @@ -961,15 +961,15 @@ key_ref_t find_key_to_update(key_ref_t keyring_ref, /* * Find a keyring with the specified name. * - * All named keyrings in the current user namespace are searched, provided they - * grant Search permission directly to the caller (unless this check is - * skipped). Keyrings whose usage points have reached zero or who have been - * revoked are skipped. + * Only keyrings that have nonzero refcount, are not revoked, and are owned by a + * user in the current user namespace are considered. If @uid_keyring is %true, + * the keyring additionally must have been allocated as a user or user session + * keyring; otherwise, it must grant Search permission directly to the caller. * * Returns a pointer to the keyring with the keyring's refcount having being * incremented on success. -ENOKEY is returned if a key could not be found. */ -struct key *find_keyring_by_name(const char *name, bool skip_perm_check) +struct key *find_keyring_by_name(const char *name, bool uid_keyring) { struct key *keyring; int bucket; @@ -997,10 +997,15 @@ struct key *find_keyring_by_name(const char *name, bool skip_perm_check) if (strcmp(keyring->description, name) != 0) continue; - if (!skip_perm_check && - key_permission(make_key_ref(keyring, 0), - KEY_NEED_SEARCH) < 0) - continue; + if (uid_keyring) { + if (!test_bit(KEY_FLAG_UID_KEYRING, + &keyring->flags)) + continue; + } else { + if (key_permission(make_key_ref(keyring, 0), + KEY_NEED_SEARCH) < 0) + continue; + } /* we've got a match but we might end up racing with * key_cleanup() if the keyring is currently 'dead' diff --git a/security/keys/process_keys.c b/security/keys/process_keys.c index 4ed909142956..7dd050f24261 100644 --- a/security/keys/process_keys.c +++ b/security/keys/process_keys.c @@ -76,7 +76,9 @@ int install_user_keyrings(void) if (IS_ERR(uid_keyring)) { uid_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, + NULL); if (IS_ERR(uid_keyring)) { ret = PTR_ERR(uid_keyring); goto error; @@ -92,7 +94,9 @@ int install_user_keyrings(void) session_keyring = keyring_alloc(buf, user->uid, INVALID_GID, cred, user_keyring_perm, - KEY_ALLOC_IN_QUOTA, NULL); + KEY_ALLOC_UID_KEYRING | + KEY_ALLOC_IN_QUOTA, + NULL); if (IS_ERR(session_keyring)) { ret = PTR_ERR(session_keyring); goto error_release; From 638b38505045e1090313ff7ed284911870cd29f8 Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 18 Sep 2017 11:37:23 -0700 Subject: [PATCH 264/546] KEYS: prevent KEYCTL_READ on negative key commit 37863c43b2c6464f252862bf2e9768264e961678 upstream. Because keyctl_read_key() looks up the key with no permissions requested, it may find a negatively instantiated key. If the key is also possessed, we went ahead and called ->read() on the key. But the key payload will actually contain the ->reject_error rather than the normal payload. Thus, the kernel oopses trying to read the user_key_payload from memory address (int)-ENOKEY = 0x00000000ffffff82. Fortunately the payload data is stored inline, so it shouldn't be possible to abuse this as an arbitrary memory read primitive... Reproducer: keyctl new_session keyctl request2 user desc '' @s keyctl read $(keyctl show | awk '/user: desc/ {print $1}') It causes a crash like the following: BUG: unable to handle kernel paging request at 00000000ffffff92 IP: user_read+0x33/0xa0 PGD 36a54067 P4D 36a54067 PUD 0 Oops: 0000 [#1] SMP CPU: 0 PID: 211 Comm: keyctl Not tainted 4.14.0-rc1 #337 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 task: ffff90aa3b74c3c0 task.stack: ffff9878c0478000 RIP: 0010:user_read+0x33/0xa0 RSP: 0018:ffff9878c047bee8 EFLAGS: 00010246 RAX: 0000000000000001 RBX: ffff90aa3d7da340 RCX: 0000000000000017 RDX: 0000000000000000 RSI: 00000000ffffff82 RDI: ffff90aa3d7da340 RBP: ffff9878c047bf00 R08: 00000024f95da94f R09: 0000000000000000 R10: 0000000000000001 R11: 0000000000000000 R12: 0000000000000000 R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 FS: 00007f58ece69740(0000) GS:ffff90aa3e200000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000ffffff92 CR3: 0000000036adc001 CR4: 00000000003606f0 Call Trace: keyctl_read_key+0xac/0xe0 SyS_keyctl+0x99/0x120 entry_SYSCALL_64_fastpath+0x1f/0xbe RIP: 0033:0x7f58ec787bb9 RSP: 002b:00007ffc8d401678 EFLAGS: 00000206 ORIG_RAX: 00000000000000fa RAX: ffffffffffffffda RBX: 00007ffc8d402800 RCX: 00007f58ec787bb9 RDX: 0000000000000000 RSI: 00000000174a63ac RDI: 000000000000000b RBP: 0000000000000004 R08: 00007ffc8d402809 R09: 0000000000000020 R10: 0000000000000000 R11: 0000000000000206 R12: 00007ffc8d402800 R13: 00007ffc8d4016e0 R14: 0000000000000000 R15: 0000000000000000 Code: e5 41 55 49 89 f5 41 54 49 89 d4 53 48 89 fb e8 a4 b4 ad ff 85 c0 74 09 80 3d b9 4c 96 00 00 74 43 48 8b b3 20 01 00 00 4d 85 ed <0f> b7 5e 10 74 29 4d 85 e4 74 24 4c 39 e3 4c 89 e2 4c 89 ef 48 RIP: user_read+0x33/0xa0 RSP: ffff9878c047bee8 CR2: 00000000ffffff92 Fixes: 61ea0c0ba904 ("KEYS: Skip key state checks when checking for possession") Signed-off-by: Eric Biggers Signed-off-by: David Howells Signed-off-by: Greg Kroah-Hartman --- security/keys/keyctl.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/security/keys/keyctl.c b/security/keys/keyctl.c index 671709d8610d..a009dc66eb8f 100644 --- a/security/keys/keyctl.c +++ b/security/keys/keyctl.c @@ -738,6 +738,11 @@ long keyctl_read_key(key_serial_t keyid, char __user *buffer, size_t buflen) key = key_ref_to_ptr(key_ref); + if (test_bit(KEY_FLAG_NEGATIVE, &key->flags)) { + ret = -ENOKEY; + goto error2; + } + /* see if we can read it directly */ ret = key_permission(key_ref, KEY_NEED_READ); if (ret == 0) From fe37a445ea3f0e8bd285c5d09fd223059c3b644e Mon Sep 17 00:00:00 2001 From: Tyrel Datwyler Date: Wed, 20 Sep 2017 17:02:52 -0400 Subject: [PATCH 265/546] powerpc/pseries: Fix parent_dn reference leak in add_dt_node() commit b537ca6fede69a281dc524983e5e633d79a10a08 upstream. A reference to the parent device node is held by add_dt_node() for the node to be added. If the call to dlpar_configure_connector() fails add_dt_node() returns ENOENT and that reference is not freed. Add a call to of_node_put(parent_dn) prior to bailing out after a failed dlpar_configure_connector() call. Fixes: 8d5ff320766f ("powerpc/pseries: Make dlpar_configure_connector parent node aware") Signed-off-by: Tyrel Datwyler Signed-off-by: Michael Ellerman Signed-off-by: Greg Kroah-Hartman --- arch/powerpc/platforms/pseries/mobility.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/pseries/mobility.c b/arch/powerpc/platforms/pseries/mobility.c index ceb18d349459..8dd0c8edefd6 100644 --- a/arch/powerpc/platforms/pseries/mobility.c +++ b/arch/powerpc/platforms/pseries/mobility.c @@ -225,8 +225,10 @@ static int add_dt_node(__be32 parent_phandle, __be32 drc_index) return -ENOENT; dn = dlpar_configure_connector(drc_index, parent_dn); - if (!dn) + if (!dn) { + of_node_put(parent_dn); return -ENOENT; + } rc = dlpar_attach_node(dn); if (rc) From c096b31f9d9a6fc479b85ab51b37c44eaf9529cf Mon Sep 17 00:00:00 2001 From: Steve French Date: Mon, 18 Sep 2017 18:18:45 -0500 Subject: [PATCH 266/546] Fix SMB3.1.1 guest authentication to Samba commit 23586b66d84ba3184b8820277f3fc42761640f87 upstream. Samba rejects SMB3.1.1 dialect (vers=3.1.1) negotiate requests from the kernel client due to the two byte pad at the end of the negotiate contexts. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 6c484ddf26a9..7123289787d8 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -361,7 +361,7 @@ assemble_neg_contexts(struct smb2_negotiate_req *req) build_encrypt_ctxt((struct smb2_encryption_neg_context *)pneg_ctxt); req->NegotiateContextOffset = cpu_to_le32(OFFSET_OF_NEG_CONTEXT); req->NegotiateContextCount = cpu_to_le16(2); - inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + 2 + inc_rfc1001_len(req, 4 + sizeof(struct smb2_preauth_neg_context) + sizeof(struct smb2_encryption_neg_context)); /* calculate hash */ } #else From 02ef29f9cbb616bf41900c427830dc8bf3f52d99 Mon Sep 17 00:00:00 2001 From: Steve French Date: Wed, 20 Sep 2017 19:57:18 -0500 Subject: [PATCH 267/546] SMB: Validate negotiate (to protect against downgrade) even if signing off commit 0603c96f3af50e2f9299fa410c224ab1d465e0f9 upstream. As long as signing is supported (ie not a guest user connection) and connection is SMB3 or SMB3.02, then validate negotiate (protect against man in the middle downgrade attacks). We had been doing this only when signing was required, not when signing was just enabled, but this more closely matches recommended SMB3 behavior and is better security. Suggested by Metze. Signed-off-by: Steve French Reviewed-by: Jeremy Allison Acked-by: Stefan Metzmacher Reviewed-by: Ronnie Sahlberg Signed-off-by: Greg Kroah-Hartman --- fs/cifs/smb2pdu.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 7123289787d8..f2ff60e58ec8 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -526,15 +526,22 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) /* * validation ioctl must be signed, so no point sending this if we - * can not sign it. We could eventually change this to selectively + * can not sign it (ie are not known user). Even if signing is not + * required (enabled but not negotiated), in those cases we selectively * sign just this, the first and only signed request on a connection. - * This is good enough for now since a user who wants better security - * would also enable signing on the mount. Having validation of - * negotiate info for signed connections helps reduce attack vectors + * Having validation of negotiate info helps reduce attack vectors. */ - if (tcon->ses->server->sign == false) + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_GUEST) return 0; /* validation requires signing */ + if (tcon->ses->user_name == NULL) { + cifs_dbg(FYI, "Can't validate negotiate: null user mount\n"); + return 0; /* validation requires signing */ + } + + if (tcon->ses->session_flags & SMB2_SESSION_FLAG_IS_NULL) + cifs_dbg(VFS, "Unexpected null user (anonymous) auth flag sent by server\n"); + vneg_inbuf.Capabilities = cpu_to_le32(tcon->ses->server->vals->req_capabilities); memcpy(vneg_inbuf.Guid, tcon->ses->server->client_guid, From 3bb7084cc031ff8c1cdbb1c50cbe3e1940eab268 Mon Sep 17 00:00:00 2001 From: Steve French Date: Fri, 22 Sep 2017 01:40:27 -0500 Subject: [PATCH 268/546] SMB3: Don't ignore O_SYNC/O_DSYNC and O_DIRECT flags commit 1013e760d10e614dc10b5624ce9fc41563ba2e65 upstream. Signed-off-by: Steve French Reviewed-by: Ronnie Sahlberg Reviewed-by: Pavel Shilovsky Signed-off-by: Greg Kroah-Hartman --- fs/cifs/file.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/cifs/file.c b/fs/cifs/file.c index a0c0a49b6620..ec2d07bb9beb 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -224,6 +224,13 @@ cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, if (backup_cred(cifs_sb)) create_options |= CREATE_OPEN_BACKUP_INTENT; + /* O_SYNC also has bit for O_DSYNC so following check picks up either */ + if (f_flags & O_SYNC) + create_options |= CREATE_WRITE_THROUGH; + + if (f_flags & O_DIRECT) + create_options |= CREATE_NO_BUFFER; + oparms.tcon = tcon; oparms.cifs_sb = cifs_sb; oparms.desired_access = desired_access; From 3393445ef440e675cb893398e722d31ad04b1ad2 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 25 Sep 2017 12:23:03 +0200 Subject: [PATCH 269/546] vfs: Return -ENXIO for negative SEEK_HOLE / SEEK_DATA offsets commit fc46820b27a2d9a46f7e90c9ceb4a64a1bc5fab8 upstream. In generic_file_llseek_size, return -ENXIO for negative offsets as well as offsets beyond EOF. This affects filesystems which don't implement SEEK_HOLE / SEEK_DATA internally, possibly because they don't support holes. Fixes xfstest generic/448. Signed-off-by: Andreas Gruenbacher Signed-off-by: Linus Torvalds Signed-off-by: Greg Kroah-Hartman --- fs/read_write.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/read_write.c b/fs/read_write.c index 819ef3faf1bb..bfd1a5dddf6e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -112,7 +112,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * In the generic case the entire file is data, so as long as * offset isn't at the end of the file then the offset is data. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; break; case SEEK_HOLE: @@ -120,7 +120,7 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, * There is a virtual hole at the end of the file, so as long as * offset isn't i_size or larger, return i_size. */ - if (offset >= eof) + if ((unsigned long long)offset >= eof) return -ENXIO; offset = eof; break; From 9d74367d1a35e87f46e5e0c2e8dd9f5d21f701b0 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Wed, 13 Sep 2017 00:21:21 +0200 Subject: [PATCH 270/546] nl80211: check for the required netlink attributes presence commit e785fa0a164aa11001cba931367c7f94ffaff888 upstream. nl80211_set_rekey_data() does not check if the required attributes NL80211_REKEY_DATA_{REPLAY_CTR,KEK,KCK} are present when processing NL80211_CMD_SET_REKEY_OFFLOAD request. This request can be issued by users with CAP_NET_ADMIN privilege and may result in NULL dereference and a system crash. Add a check for the required attributes presence. This patch is based on the patch by bo Zhang. This fixes CVE-2017-12153. References: https://bugzilla.redhat.com/show_bug.cgi?id=1491046 Fixes: e5497d766ad ("cfg80211/nl80211: support GTK rekey offload") Reported-by: bo Zhang Signed-off-by: Vladis Dronov Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index de10e3c0e2a4..8ece212aa3d2 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -9786,6 +9786,9 @@ static int nl80211_set_rekey_data(struct sk_buff *skb, struct genl_info *info) if (err) return err; + if (!tb[NL80211_REKEY_DATA_REPLAY_CTR] || !tb[NL80211_REKEY_DATA_KEK] || + !tb[NL80211_REKEY_DATA_KCK]) + return -EINVAL; if (nla_len(tb[NL80211_REKEY_DATA_REPLAY_CTR]) != NL80211_REPLAY_CTR_LEN) return -ERANGE; if (nla_len(tb[NL80211_REKEY_DATA_KEK]) != NL80211_KEK_LEN) From 668cee82cd28d2c38a99f7cbddf3b3fd58f257b9 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Thu, 7 Sep 2017 13:54:35 +0200 Subject: [PATCH 271/546] bsg-lib: don't free job in bsg_prepare_job commit f507b54dccfd8000c517d740bc45f20c74532d18 upstream. The job structure is allocated as part of the request, so we should not free it in the error path of bsg_prepare_job. Signed-off-by: Christoph Hellwig Reviewed-by: Ming Lei Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- block/bsg-lib.c | 1 - 1 file changed, 1 deletion(-) diff --git a/block/bsg-lib.c b/block/bsg-lib.c index 650f427d915b..341b8d858e67 100644 --- a/block/bsg-lib.c +++ b/block/bsg-lib.c @@ -147,7 +147,6 @@ static int bsg_create_job(struct device *dev, struct request *req) failjob_rls_rqst_payload: kfree(job->request_payload.sg_list); failjob_rls_job: - kfree(job); return -ENOMEM; } From 9237605e0bfb0e469b54344a4455e6b241d6c0c9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 27 Sep 2017 09:25:30 -0600 Subject: [PATCH 272/546] seccomp: fix the usage of get/put_seccomp_filter() in seccomp_get_filter() commit 66a733ea6b611aecf0119514d2dddab5f9d6c01e upstream. As Chris explains, get_seccomp_filter() and put_seccomp_filter() can end up using different filters. Once we drop ->siglock it is possible for task->seccomp.filter to have been replaced by SECCOMP_FILTER_FLAG_TSYNC. Fixes: f8e529ed941b ("seccomp, ptrace: add support for dumping seccomp filters") Reported-by: Chris Salls Signed-off-by: Oleg Nesterov [tycho: add __get_seccomp_filter vs. open coding refcount_inc()] Signed-off-by: Tycho Andersen [kees: tweak commit log] Signed-off-by: Kees Cook Signed-off-by: Greg Kroah-Hartman --- kernel/seccomp.c | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/kernel/seccomp.c b/kernel/seccomp.c index 15a1795bbba1..efd384f3f852 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -457,14 +457,19 @@ static long seccomp_attach_filter(unsigned int flags, return 0; } +void __get_seccomp_filter(struct seccomp_filter *filter) +{ + /* Reference count is bounded by the number of total processes. */ + atomic_inc(&filter->usage); +} + /* get_seccomp_filter - increments the reference count of the filter on @tsk */ void get_seccomp_filter(struct task_struct *tsk) { struct seccomp_filter *orig = tsk->seccomp.filter; if (!orig) return; - /* Reference count is bounded by the number of total processes. */ - atomic_inc(&orig->usage); + __get_seccomp_filter(orig); } static inline void seccomp_filter_free(struct seccomp_filter *filter) @@ -475,10 +480,8 @@ static inline void seccomp_filter_free(struct seccomp_filter *filter) } } -/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ -void put_seccomp_filter(struct task_struct *tsk) +static void __put_seccomp_filter(struct seccomp_filter *orig) { - struct seccomp_filter *orig = tsk->seccomp.filter; /* Clean up single-reference branches iteratively. */ while (orig && atomic_dec_and_test(&orig->usage)) { struct seccomp_filter *freeme = orig; @@ -487,6 +490,12 @@ void put_seccomp_filter(struct task_struct *tsk) } } +/* put_seccomp_filter - decrements the ref count of tsk->seccomp.filter */ +void put_seccomp_filter(struct task_struct *tsk) +{ + __put_seccomp_filter(tsk->seccomp.filter); +} + /** * seccomp_send_sigsys - signals the task to allow in-process syscall emulation * @syscall: syscall number to send to userland @@ -927,13 +936,13 @@ long seccomp_get_filter(struct task_struct *task, unsigned long filter_off, if (!data) goto out; - get_seccomp_filter(task); + __get_seccomp_filter(filter); spin_unlock_irq(&task->sighand->siglock); if (copy_to_user(data, fprog->filter, bpf_classic_proglen(fprog))) ret = -EFAULT; - put_seccomp_filter(task); + __put_seccomp_filter(filter); return ret; out: From 638e7874f68208d18d392cdd61ba389e53f6bb0c Mon Sep 17 00:00:00 2001 From: Marc Zyngier Date: Tue, 26 Sep 2017 15:57:16 +0100 Subject: [PATCH 273/546] arm64: Make sure SPsel is always set commit 5371513fb338fb9989c569dc071326d369d6ade8 upstream. When the kernel is entered at EL2 on an ARMv8.0 system, we construct the EL1 pstate and make sure this uses the the EL1 stack pointer (we perform an exception return to EL1h). But if the kernel is either entered at EL1 or stays at EL2 (because we're on a VHE-capable system), we fail to set SPsel, and use whatever stack selection the higher exception level has choosen for us. Let's not take any chance, and make sure that SPsel is set to one before we decide the mode we're going to run in. Acked-by: Mark Rutland Signed-off-by: Marc Zyngier Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/kernel/head.S | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index 20ceb5edf7b8..d019c3a58cc2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -446,6 +446,7 @@ ENDPROC(__mmap_switched) * booted in EL1 or EL2 respectively. */ ENTRY(el2_setup) + msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.ne 1f From e726c30c758b155e0295dc49fdf26cfb8f0d4806 Mon Sep 17 00:00:00 2001 From: Will Deacon Date: Fri, 29 Sep 2017 12:27:41 +0100 Subject: [PATCH 274/546] arm64: fault: Route pte translation faults via do_translation_fault commit 760bfb47c36a07741a089bf6a28e854ffbee7dc9 upstream. We currently route pte translation faults via do_page_fault, which elides the address check against TASK_SIZE before invoking the mm fault handling code. However, this can cause issues with the path walking code in conjunction with our word-at-a-time implementation because load_unaligned_zeropad can end up faulting in kernel space if it reads across a page boundary and runs into a page fault (e.g. by attempting to read from a guard region). In the case of such a fault, load_unaligned_zeropad has registered a fixup to shift the valid data and pad with zeroes, however the abort is reported as a level 3 translation fault and we dispatch it straight to do_page_fault, despite it being a kernel address. This results in calling a sleeping function from atomic context: BUG: sleeping function called from invalid context at arch/arm64/mm/fault.c:313 in_atomic(): 0, irqs_disabled(): 0, pid: 10290 Internal error: Oops - BUG: 0 [#1] PREEMPT SMP [...] [] ___might_sleep+0x134/0x144 [] __might_sleep+0x7c/0x8c [] do_page_fault+0x140/0x330 [] do_mem_abort+0x54/0xb0 Exception stack(0xfffffffb20247a70 to 0xfffffffb20247ba0) [...] [] el1_da+0x18/0x78 [] path_parentat+0x44/0x88 [] filename_parentat+0x5c/0xd8 [] filename_create+0x4c/0x128 [] SyS_mkdirat+0x50/0xc8 [] el0_svc_naked+0x24/0x28 Code: 36380080 d5384100 f9400800 9402566d (d4210000) ---[ end trace 2d01889f2bca9b9f ]--- Fix this by dispatching all translation faults to do_translation_faults, which avoids invoking the page fault logic for faults on kernel addresses. Reported-by: Ankit Jain Signed-off-by: Will Deacon Signed-off-by: Catalin Marinas Signed-off-by: Greg Kroah-Hartman --- arch/arm64/mm/fault.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/arm64/mm/fault.c b/arch/arm64/mm/fault.c index 7fabf49f2aeb..86485415c5f0 100644 --- a/arch/arm64/mm/fault.c +++ b/arch/arm64/mm/fault.c @@ -447,7 +447,7 @@ static struct fault_info { { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, - { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, { do_bad, SIGBUS, 0, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, From 7520be6a454c28955e711fdb49c81519bc537b39 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Thu, 7 Sep 2017 19:02:30 +0100 Subject: [PATCH 275/546] KVM: VMX: Do not BUG() on out-of-bounds guest IRQ MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 3a8b0677fc6180a467e26cc32ce6b0c09a32f9bb upstream. The value of the guest_irq argument to vmx_update_pi_irte() is ultimately coming from a KVM_IRQFD API call. Do not BUG() in vmx_update_pi_irte() if the value is out-of bounds. (Especially, since KVM as a whole seems to hang after that.) Instead, print a message only once if we find that we don't have a route for a certain IRQ (which can be out-of-bounds or within the array). This fixes CVE-2017-1000252. Fixes: efc644048ecde54 ("KVM: x86: Update IRTE for posted-interrupts") Signed-off-by: Jan H. Schönherr Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index b12391119ce8..bd3407a7a9ee 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10755,7 +10755,7 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = -EINVAL; + int idx, ret = 0; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP)) @@ -10763,7 +10763,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, idx = srcu_read_lock(&kvm->irq_srcu); irq_rt = srcu_dereference(kvm->irq_routing, &kvm->irq_srcu); - BUG_ON(guest_irq >= irq_rt->nr_rt_entries); + if (guest_irq >= irq_rt->nr_rt_entries || + hlist_empty(&irq_rt->map[guest_irq])) { + pr_warn_once("no route for guest_irq %u/%u (broken user space?)\n", + guest_irq, irq_rt->nr_rt_entries); + goto out; + } hlist_for_each_entry(e, &irq_rt->map[guest_irq], link) { if (e->type != KVM_IRQ_ROUTING_MSI) From 21a638c5efd6ec7a10441bfb94e15e5288920f07 Mon Sep 17 00:00:00 2001 From: Jim Mattson Date: Tue, 12 Sep 2017 13:02:54 -0700 Subject: [PATCH 276/546] kvm: nVMX: Don't allow L2 to access the hardware CR8 commit 51aa68e7d57e3217192d88ce90fd5b8ef29ec94f upstream. If L1 does not specify the "use TPR shadow" VM-execution control in vmcs12, then L0 must specify the "CR8-load exiting" and "CR8-store exiting" VM-execution controls in vmcs02. Failure to do so will give the L2 VM unrestricted read/write access to the hardware CR8. This fixes CVE-2017-12154. Signed-off-by: Jim Mattson Reviewed-by: David Hildenbrand Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index bd3407a7a9ee..ee7ae9e937b2 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -9683,6 +9683,11 @@ static void prepare_vmcs02(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12) vmcs_write64(VIRTUAL_APIC_PAGE_ADDR, page_to_phys(vmx->nested.virtual_apic_page)); vmcs_write32(TPR_THRESHOLD, vmcs12->tpr_threshold); + } else { +#ifdef CONFIG_X86_64 + exec_control |= CPU_BASED_CR8_LOAD_EXITING | + CPU_BASED_CR8_STORE_EXITING; +#endif } if (cpu_has_vmx_msr_bitmap() && From b08dc7d4cfa124961798fb82aaca7e5fd44a7671 Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:40 +0200 Subject: [PATCH 277/546] PCI: Fix race condition with driver_override commit 9561475db680f7144d2223a409dd3d7e322aca03 upstream. The driver_override implementation is susceptible to a race condition when different threads are reading vs. storing a different driver override. Add locking to avoid the race condition. This is in close analogy to commit 6265539776a0 ("driver core: platform: fix race condition with driver_override") from Adrian Salido. Fixes: 782a985d7af2 ("PCI: Introduce new device binding path using pci_dev.driver_override") Signed-off-by: Nicolai Stange Signed-off-by: Bjorn Helgaas Signed-off-by: Greg Kroah-Hartman --- drivers/pci/pci-sysfs.c | 11 +++++++++-- 1 file changed, 9 insertions(+), 2 deletions(-) diff --git a/drivers/pci/pci-sysfs.c b/drivers/pci/pci-sysfs.c index f8b2b5987ea9..ec91cd17bf34 100644 --- a/drivers/pci/pci-sysfs.c +++ b/drivers/pci/pci-sysfs.c @@ -522,7 +522,7 @@ static ssize_t driver_override_store(struct device *dev, const char *buf, size_t count) { struct pci_dev *pdev = to_pci_dev(dev); - char *driver_override, *old = pdev->driver_override, *cp; + char *driver_override, *old, *cp; /* We need to keep extra room for a newline */ if (count >= (PAGE_SIZE - 1)) @@ -536,12 +536,15 @@ static ssize_t driver_override_store(struct device *dev, if (cp) *cp = '\0'; + device_lock(dev); + old = pdev->driver_override; if (strlen(driver_override)) { pdev->driver_override = driver_override; } else { kfree(driver_override); pdev->driver_override = NULL; } + device_unlock(dev); kfree(old); @@ -552,8 +555,12 @@ static ssize_t driver_override_show(struct device *dev, struct device_attribute *attr, char *buf) { struct pci_dev *pdev = to_pci_dev(dev); + ssize_t len; - return snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_lock(dev); + len = snprintf(buf, PAGE_SIZE, "%s\n", pdev->driver_override); + device_unlock(dev); + return len; } static DEVICE_ATTR_RW(driver_override); From 9a7d93dd2cadf532e5a7c490e05c55d44ea4e583 Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 25 Aug 2017 14:15:14 +0900 Subject: [PATCH 278/546] btrfs: fix NULL pointer dereference from free_reloc_roots() commit bb166d7207432d3c7d10c45dc052f12ba3a2121d upstream. __del_reloc_root should be called before freeing up reloc_root->node. If not, calling __del_reloc_root() dereference reloc_root->node, causing the system BUG. Fixes: 6bdf131fac23 ("Btrfs: don't leak reloc root nodes on error") Signed-off-by: Naohiro Aota Reviewed-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/relocation.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 8ca9aa92972d..9ebe027cc4b7 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -2350,11 +2350,11 @@ void free_reloc_roots(struct list_head *list) while (!list_empty(list)) { reloc_root = list_entry(list->next, struct btrfs_root, root_list); + __del_reloc_root(reloc_root); free_extent_buffer(reloc_root->node); free_extent_buffer(reloc_root->commit_root); reloc_root->node = NULL; reloc_root->commit_root = NULL; - __del_reloc_root(reloc_root); } } From 0efde43517a54007c4b19a72d9e3da8a0199074e Mon Sep 17 00:00:00 2001 From: Naohiro Aota Date: Fri, 8 Sep 2017 17:48:55 +0900 Subject: [PATCH 279/546] btrfs: propagate error to btrfs_cmp_data_prepare caller commit 78ad4ce014d025f41b8dde3a81876832ead643cf upstream. btrfs_cmp_data_prepare() (almost) always returns 0 i.e. ignoring errors from gather_extent_pages(). While the pages are freed by btrfs_cmp_data_free(), cmp->num_pages still has > 0. Then, btrfs_extent_same() try to access the already freed pages causing faults (or violates PageLocked assertion). This patch just return the error as is so that the caller stop the process. Signed-off-by: Naohiro Aota Fixes: f441460202cb ("btrfs: fix deadlock with extent-same and readpage") Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 317b99acdf4b..be43d1c5b5fb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -2984,7 +2984,7 @@ static int btrfs_cmp_data_prepare(struct inode *src, u64 loff, out: if (ret) btrfs_cmp_data_free(cmp); - return 0; + return ret; } static int btrfs_cmp_data(struct inode *src, u64 loff, struct inode *dst, From 4c16afac1875db0ad251b1df0e12203302528fcb Mon Sep 17 00:00:00 2001 From: satoru takeuchi Date: Tue, 12 Sep 2017 22:42:52 +0900 Subject: [PATCH 280/546] btrfs: prevent to set invalid default subvolid commit 6d6d282932d1a609e60dc4467677e0e863682f57 upstream. `btrfs sub set-default` succeeds to set an ID which isn't corresponding to any fs/file tree. If such the bad ID is set to a filesystem, we can't mount this filesystem without specifying `subvol` or `subvolid` mount options. Fixes: 6ef5ed0d386b ("Btrfs: add ioctl and incompat flag to set the default mount subvol") Signed-off-by: Satoru Takeuchi Reviewed-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/ioctl.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index be43d1c5b5fb..9c3b9d07f341 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4118,6 +4118,10 @@ static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp) ret = PTR_ERR(new_root); goto out; } + if (!is_fstree(new_root->objectid)) { + ret = -ENOENT; + goto out; + } path = btrfs_alloc_path(); if (!path) { From d25fea066a8ed4457a9f8b23eb78204b9b6896cf Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Mon, 2 Oct 2017 11:04:09 -0700 Subject: [PATCH 281/546] x86/fpu: Don't let userspace set bogus xcomp_bv commit 814fb7bb7db5433757d76f4c4502c96fc53b0b5e upstream. [Please apply to 4.4-stable. Note: the backport includes the fpstate_init() call in xstateregs_set(), since fix is useless without it. It was added by commit 91c3dba7dbc1 ("x86/fpu/xstate: Fix PTRACE frames for XSAVES"), but it doesn't make sense to backport that whole commit.] On x86, userspace can use the ptrace() or rt_sigreturn() system calls to set a task's extended state (xstate) or "FPU" registers. ptrace() can set them for another task using the PTRACE_SETREGSET request with NT_X86_XSTATE, while rt_sigreturn() can set them for the current task. In either case, registers can be set to any value, but the kernel assumes that the XSAVE area itself remains valid in the sense that the CPU can restore it. However, in the case where the kernel is using the uncompacted xstate format (which it does whenever the XSAVES instruction is unavailable), it was possible for userspace to set the xcomp_bv field in the xstate_header to an arbitrary value. However, all bits in that field are reserved in the uncompacted case, so when switching to a task with nonzero xcomp_bv, the XRSTOR instruction failed with a #GP fault. This caused the WARN_ON_FPU(err) in copy_kernel_to_xregs() to be hit. In addition, since the error is otherwise ignored, the FPU registers from the task previously executing on the CPU were leaked. Fix the bug by checking that the user-supplied value of xcomp_bv is 0 in the uncompacted case, and returning an error otherwise. The reason for validating xcomp_bv rather than simply overwriting it with 0 is that we want userspace to see an error if it (incorrectly) provides an XSAVE area in compacted format rather than in uncompacted format. Note that as before, in case of error we clear the task's FPU state. This is perhaps non-ideal, especially for PTRACE_SETREGSET; it might be better to return an error before changing anything. But it seems the "clear on error" behavior is fine for now, and it's a little tricky to do otherwise because it would mean we couldn't simply copy the full userspace state into kernel memory in one __copy_from_user(). This bug was found by syzkaller, which hit the above-mentioned WARN_ON_FPU(): WARNING: CPU: 1 PID: 0 at ./arch/x86/include/asm/fpu/internal.h:373 __switch_to+0x5b5/0x5d0 CPU: 1 PID: 0 Comm: swapper/1 Not tainted 4.13.0 #453 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 task: ffff9ba2bc8e42c0 task.stack: ffffa78cc036c000 RIP: 0010:__switch_to+0x5b5/0x5d0 RSP: 0000:ffffa78cc08bbb88 EFLAGS: 00010082 RAX: 00000000fffffffe RBX: ffff9ba2b8bf2180 RCX: 00000000c0000100 RDX: 00000000ffffffff RSI: 000000005cb10700 RDI: ffff9ba2b8bf36c0 RBP: ffffa78cc08bbbd0 R08: 00000000929fdf46 R09: 0000000000000001 R10: 0000000000000000 R11: 0000000000000000 R12: ffff9ba2bc8e42c0 R13: 0000000000000000 R14: ffff9ba2b8bf3680 R15: ffff9ba2bf5d7b40 FS: 00007f7e5cb10700(0000) GS:ffff9ba2bf400000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00000000004005cc CR3: 0000000079fd5000 CR4: 00000000001406e0 Call Trace: Code: 84 00 00 00 00 00 e9 11 fd ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 e7 fa ff ff 0f ff 66 0f 1f 84 00 00 00 00 00 e9 c2 fa ff ff <0f> ff 66 0f 1f 84 00 00 00 00 00 e9 d4 fc ff ff 66 66 2e 0f 1f Here is a C reproducer. The expected behavior is that the program spin forever with no output. However, on a buggy kernel running on a processor with the "xsave" feature but without the "xsaves" feature (e.g. Sandy Bridge through Broadwell for Intel), within a second or two the program reports that the xmm registers were corrupted, i.e. were not restored correctly. With CONFIG_X86_DEBUG_FPU=y it also hits the above kernel warning. #define _GNU_SOURCE #include #include #include #include #include #include #include #include int main(void) { int pid = fork(); uint64_t xstate[512]; struct iovec iov = { .iov_base = xstate, .iov_len = sizeof(xstate) }; if (pid == 0) { bool tracee = true; for (int i = 0; i < sysconf(_SC_NPROCESSORS_ONLN) && tracee; i++) tracee = (fork() != 0); uint32_t xmm0[4] = { [0 ... 3] = tracee ? 0x00000000 : 0xDEADBEEF }; asm volatile(" movdqu %0, %%xmm0\n" " mov %0, %%rbx\n" "1: movdqu %%xmm0, %0\n" " mov %0, %%rax\n" " cmp %%rax, %%rbx\n" " je 1b\n" : "+m" (xmm0) : : "rax", "rbx", "xmm0"); printf("BUG: xmm registers corrupted! tracee=%d, xmm0=%08X%08X%08X%08X\n", tracee, xmm0[0], xmm0[1], xmm0[2], xmm0[3]); } else { usleep(100000); ptrace(PTRACE_ATTACH, pid, 0, 0); wait(NULL); ptrace(PTRACE_GETREGSET, pid, NT_X86_XSTATE, &iov); xstate[65] = -1; ptrace(PTRACE_SETREGSET, pid, NT_X86_XSTATE, &iov); ptrace(PTRACE_CONT, pid, 0, 0); wait(NULL); } return 1; } Note: the program only tests for the bug using the ptrace() system call. The bug can also be reproduced using the rt_sigreturn() system call, but only when called from a 32-bit program, since for 64-bit programs the kernel restores the FPU state from the signal frame by doing XRSTOR directly from userspace memory (with proper error checking). Reported-by: Dmitry Vyukov Signed-off-by: Eric Biggers Reviewed-by: Kees Cook Reviewed-by: Rik van Riel Acked-by: Dave Hansen Cc: Andrew Morton Cc: Andy Lutomirski Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Eric Biggers Cc: Fenghua Yu Cc: Kevin Hao Cc: Linus Torvalds Cc: Michael Halcrow Cc: Oleg Nesterov Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Wanpeng Li Cc: Yu-cheng Yu Cc: kernel-hardening@lists.openwall.com Fixes: 0b29643a5843 ("x86/xsaves: Change compacted format xsave area header") Link: http://lkml.kernel.org/r/20170922174156.16780-2-ebiggers3@gmail.com Link: http://lkml.kernel.org/r/20170923130016.21448-25-mingo@kernel.org Signed-off-by: Ingo Molnar Signed-off-by: Greg Kroah-Hartman --- arch/x86/kernel/fpu/regset.c | 11 +++++++++++ arch/x86/kernel/fpu/signal.c | 4 +++- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/x86/kernel/fpu/regset.c b/arch/x86/kernel/fpu/regset.c index 0bc3490420c5..72a483c295f2 100644 --- a/arch/x86/kernel/fpu/regset.c +++ b/arch/x86/kernel/fpu/regset.c @@ -116,6 +116,11 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, xsave = &fpu->state.xsave; ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; + /* * mxcsr reserved bits must be masked to zero for security reasons. */ @@ -126,6 +131,12 @@ int xstateregs_set(struct task_struct *target, const struct user_regset *regset, */ memset(&xsave->header.reserved, 0, 48); + /* + * In case of failure, mark all states as init: + */ + if (ret) + fpstate_init(&fpu->state); + return ret; } diff --git a/arch/x86/kernel/fpu/signal.c b/arch/x86/kernel/fpu/signal.c index 31c6a60505e6..3de077116218 100644 --- a/arch/x86/kernel/fpu/signal.c +++ b/arch/x86/kernel/fpu/signal.c @@ -309,7 +309,9 @@ static int __fpu__restore_sig(void __user *buf, void __user *buf_fx, int size) fpu__drop(fpu); if (__copy_from_user(&fpu->state.xsave, buf_fx, state_size) || - __copy_from_user(&env, buf, sizeof(env))) { + __copy_from_user(&env, buf, sizeof(env)) || + (state_size > offsetof(struct xregs_state, header) && + fpu->state.xsave.header.xcomp_bv)) { fpstate_init(&fpu->state); err = -1; } else { From ddf25aea679de86150a34821ed7144d670db2aa9 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Tue, 19 Sep 2017 07:15:35 -0500 Subject: [PATCH 282/546] gfs2: Fix debugfs glocks dump commit 10201655b085df8e000822e496e5d4016a167a36 upstream. The switch to rhashtables (commit 88ffbf3e03) broke the debugfs glock dump (/sys/kernel/debug/gfs2//glocks) for dumps bigger than a single buffer: the right function for restarting an rhashtable iteration from the beginning of the hash table is rhashtable_walk_enter; rhashtable_walk_stop + rhashtable_walk_start will just resume from the current position. The upstream commit doesn't directly apply to 4.4.y because 4.4.y doesn't have rhashtable_walk_enter and the following mainline commits: 92ecd73a887c4a2b94daf5fc35179d75d1c4ef95 gfs2: Deduplicate gfs2_{glocks,glstats}_open cc37a62785a584f4875788689f3fd1fa6e4eb291 gfs2: Replace rhashtable_walk_init with rhashtable_walk_enter Other than rhashtable_walk_enter, rhashtable_walk_init can fail. To handle the failure case in gfs2_glock_seq_stop, we check if rhashtable_walk_init has initialized iter->walker; if it has not, we must not call rhashtable_walk_stop or rhashtable_walk_exit. Signed-off-by: Andreas Gruenbacher Signed-off-by: Bob Peterson Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/glock.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 070901e76653..ff36f5475d7e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1814,13 +1814,10 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) { struct gfs2_glock_iter *gi = seq->private; loff_t n = *pos; - int ret; - if (gi->last_pos <= *pos) - n = (*pos - gi->last_pos); - - ret = rhashtable_walk_start(&gi->hti); - if (ret) + if (rhashtable_walk_init(&gl_hash_table, &gi->hti) != 0) + return NULL; + if (rhashtable_walk_start(&gi->hti) != 0) return NULL; do { @@ -1828,6 +1825,7 @@ static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) } while (gi->gl && n--); gi->last_pos = *pos; + return gi->gl; } @@ -1839,6 +1837,7 @@ static void *gfs2_glock_seq_next(struct seq_file *seq, void *iter_ptr, (*pos)++; gi->last_pos = *pos; gfs2_glock_iter_next(gi); + return gi->gl; } @@ -1847,7 +1846,10 @@ static void gfs2_glock_seq_stop(struct seq_file *seq, void *iter_ptr) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; - rhashtable_walk_stop(&gi->hti); + if (gi->hti.walker) { + rhashtable_walk_stop(&gi->hti); + rhashtable_walk_exit(&gi->hti); + } } static int gfs2_glock_seq_show(struct seq_file *seq, void *iter_ptr) @@ -1910,12 +1912,10 @@ static int gfs2_glocks_open(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti); } return ret; } @@ -1926,7 +1926,6 @@ static int gfs2_glocks_release(struct inode *inode, struct file *file) struct gfs2_glock_iter *gi = seq->private; gi->gl = NULL; - rhashtable_walk_exit(&gi->hti); return seq_release_private(inode, file); } @@ -1938,12 +1937,10 @@ static int gfs2_glstats_open(struct inode *inode, struct file *file) struct seq_file *seq = file->private_data; struct gfs2_glock_iter *gi = seq->private; gi->sdp = inode->i_private; - gi->last_pos = 0; seq->buf = kmalloc(GFS2_SEQ_GOODSIZE, GFP_KERNEL | __GFP_NOWARN); if (seq->buf) seq->size = GFS2_SEQ_GOODSIZE; gi->gl = NULL; - ret = rhashtable_walk_init(&gl_hash_table, &gi->hti); } return ret; } From 5e9b526fcc907c17c6d62ce0dc7d044d6613d419 Mon Sep 17 00:00:00 2001 From: Myungho Jung Date: Wed, 19 Apr 2017 15:24:50 -0700 Subject: [PATCH 283/546] timer/sysclt: Restrict timer migration sysctl values to 0 and 1 commit b94bf594cf8ed67cdd0439e70fa939783471597a upstream. timer_migration sysctl acts as a boolean switch, so the allowed values should be restricted to 0 and 1. Add the necessary extra fields to the sysctl table entry to enforce that. [ tglx: Rewrote changelog ] Signed-off-by: Myungho Jung Link: http://lkml.kernel.org/r/1492640690-3550-1-git-send-email-mhjungk@gmail.com Signed-off-by: Thomas Gleixner Cc: Kazuhiro Hayashi Signed-off-by: Greg Kroah-Hartman --- kernel/sysctl.c | 2 ++ kernel/time/timer.c | 2 +- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 002ec084124b..17c59e78661b 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1159,6 +1159,8 @@ static struct ctl_table kern_table[] = { .maxlen = sizeof(unsigned int), .mode = 0644, .proc_handler = timer_migration_handler, + .extra1 = &zero, + .extra2 = &one, }, #endif #ifdef CONFIG_BPF_SYSCALL diff --git a/kernel/time/timer.c b/kernel/time/timer.c index bbc5d1114583..125407144c01 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -127,7 +127,7 @@ int timer_migration_handler(struct ctl_table *table, int write, int ret; mutex_lock(&mutex); - ret = proc_dointvec(table, write, buffer, lenp, ppos); + ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); if (!ret && write) timers_update_migration(false); mutex_unlock(&mutex); From fc39e561e3430694e366e228354d16abbd30ba13 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:49 +0800 Subject: [PATCH 284/546] KVM: VMX: do not change SN bit in vmx_update_pi_irte() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit dc91f2eb1a4021eb6705c15e474942f84ab9b211 upstream. In kvm_vcpu_trigger_posted_interrupt() and pi_pre_block(), KVM assumes that PI notification events should not be suppressed when the target vCPU is not blocked. vmx_update_pi_irte() sets the SN field before changing an interrupt from posting to remapping, but it does not check the vCPU mode. Therefore, the change of SN field may break above the assumption. Besides, I don't see reasons to suppress notification events here, so remove the changes of SN field to avoid race condition. Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 6 +----- 1 file changed, 1 insertion(+), 5 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index ee7ae9e937b2..5aeddea1e9d1 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10803,12 +10803,8 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (set) ret = irq_set_vcpu_affinity(host_irq, &vcpu_info); - else { - /* suppress notification event before unposting */ - pi_set_sn(vcpu_to_pi_desc(vcpu)); + else ret = irq_set_vcpu_affinity(host_irq, NULL); - pi_clear_sn(vcpu_to_pi_desc(vcpu)); - } if (ret < 0) { printk(KERN_INFO "%s: failed to update PI IRTE\n", From 9037837e0c32f2a90fbd0824f271b23d49e3eb35 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Mon, 18 Sep 2017 09:56:50 +0800 Subject: [PATCH 285/546] KVM: VMX: remove WARN_ON_ONCE in kvm_vcpu_trigger_posted_interrupt MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 5753743fa5108b8f98bd61e40dc63f641b26c768 upstream. WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)) in kvm_vcpu_trigger_posted_interrupt() intends to detect the violation of invariant that VT-d PI notification event is not suppressed when vcpu is in the guest mode. Because the two checks for the target vcpu mode and the target suppress field cannot be performed atomically, the target vcpu mode may change in between. If that does happen, WARN_ON_ONCE() here may raise false alarms. As the previous patch fixed the real invariant breaker, remove this WARN_ON_ONCE() to avoid false alarms, and document the allowed cases instead. Signed-off-by: Haozhong Zhang Reported-by: "Ramamurthy, Venkatesh" Reported-by: Dan Williams Reviewed-by: Paolo Bonzini Fixes: 28b835d60fcc ("KVM: Update Posted-Interrupts Descriptor when vCPU is preempted") Signed-off-by: Radim Krčmář Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 33 +++++++++++++++++++++------------ 1 file changed, 21 insertions(+), 12 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 5aeddea1e9d1..67f27cc1d1b6 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -4541,21 +4541,30 @@ static inline bool kvm_vcpu_trigger_posted_interrupt(struct kvm_vcpu *vcpu) { #ifdef CONFIG_SMP if (vcpu->mode == IN_GUEST_MODE) { - struct vcpu_vmx *vmx = to_vmx(vcpu); - /* - * Currently, we don't support urgent interrupt, - * all interrupts are recognized as non-urgent - * interrupt, so we cannot post interrupts when - * 'SN' is set. + * The vector of interrupt to be delivered to vcpu had + * been set in PIR before this function. * - * If the vcpu is in guest mode, it means it is - * running instead of being scheduled out and - * waiting in the run queue, and that's the only - * case when 'SN' is set currently, warning if - * 'SN' is set. + * Following cases will be reached in this block, and + * we always send a notification event in all cases as + * explained below. + * + * Case 1: vcpu keeps in non-root mode. Sending a + * notification event posts the interrupt to vcpu. + * + * Case 2: vcpu exits to root mode and is still + * runnable. PIR will be synced to vIRR before the + * next vcpu entry. Sending a notification event in + * this case has no effect, as vcpu is not in root + * mode. + * + * Case 3: vcpu exits to root mode and is blocked. + * vcpu_block() has already synced PIR to vIRR and + * never blocks vcpu if vIRR is not cleared. Therefore, + * a blocked vcpu here does not wait for any requested + * interrupts in PIR, and sending a notification event + * which has no effect is safe here. */ - WARN_ON_ONCE(pi_test_sn(&vmx->pi_desc)); apic->send_IPI_mask(get_cpu_mask(vcpu->cpu), POSTED_INTR_VECTOR); From 6124ed1a712a2dc886abb826748a7a3254186c6c Mon Sep 17 00:00:00 2001 From: Frederic Barrat Date: Wed, 30 Aug 2017 12:15:49 +0200 Subject: [PATCH 286/546] cxl: Fix driver use count commit 197267d0356004a31c4d6b6336598f5dff3301e1 upstream. cxl keeps a driver use count, which is used with the hash memory model on p8 to know when to upgrade local TLBIs to global and to trigger callbacks to manage the MMU for PSL8. If a process opens a context and closes without attaching or fails the attachment, the driver use count is never decremented. As a consequence, TLB invalidations remain global, even if there are no active cxl contexts. We should increment the driver use count when the process is attaching to the cxl adapter, and not on open. It's not needed before the adapter starts using the context and the use count is decremented on the detach path, so it makes more sense. It affects only the user api. The kernel api is already doing The Right Thing. Signed-off-by: Frederic Barrat Fixes: 7bb5d91a4dda ("cxl: Rework context lifetimes") Acked-by: Andrew Donnellan Signed-off-by: Michael Ellerman [ajd: backport to stable v4.4 tree] Signed-off-by: Andrew Donnellan Signed-off-by: Greg Kroah-Hartman --- drivers/misc/cxl/api.c | 4 ++++ drivers/misc/cxl/file.c | 8 +++++++- 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/misc/cxl/api.c b/drivers/misc/cxl/api.c index ea3eeb7011e1..690eb1a18caf 100644 --- a/drivers/misc/cxl/api.c +++ b/drivers/misc/cxl/api.c @@ -176,6 +176,10 @@ int cxl_start_context(struct cxl_context *ctx, u64 wed, kernel = false; } + /* + * Increment driver use count. Enables global TLBIs for hash + * and callbacks to handle the segment table + */ cxl_ctx_get(); if ((rc = cxl_attach_process(ctx, kernel, wed , 0))) { diff --git a/drivers/misc/cxl/file.c b/drivers/misc/cxl/file.c index 10a02934bfc0..013558f4da4f 100644 --- a/drivers/misc/cxl/file.c +++ b/drivers/misc/cxl/file.c @@ -94,7 +94,6 @@ static int __afu_open(struct inode *inode, struct file *file, bool master) pr_devel("afu_open pe: %i\n", ctx->pe); file->private_data = ctx; - cxl_ctx_get(); /* indicate success */ rc = 0; @@ -205,11 +204,18 @@ static long afu_ioctl_start_work(struct cxl_context *ctx, ctx->pid = get_task_pid(current, PIDTYPE_PID); ctx->glpid = get_task_pid(current->group_leader, PIDTYPE_PID); + /* + * Increment driver use count. Enables global TLBIs for hash + * and callbacks to handle the segment table + */ + cxl_ctx_get(); + trace_cxl_attach(ctx, work.work_element_descriptor, work.num_interrupts, amr); if ((rc = cxl_attach_process(ctx, false, work.work_element_descriptor, amr))) { afu_release_irqs(ctx, ctx); + cxl_ctx_put(); goto out; } From a85f176c857e4fe0d2c0b5d2c7090c422982e585 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 15 Feb 2016 21:57:46 +0100 Subject: [PATCH 287/546] dmaengine: mmp-pdma: add number of requestors commit c283e41ef32442f41e7180f9bb1c5aedf9255bfe upstream. The DMA chip has a fixed number of requestor lines used for flow control. This number is platform dependent. The pxa_dma dma driver will use this value to activate or not the flow control. There won't be any impact on mmp_pdma driver. Signed-off-by: Robert Jarzmik Signed-off-by: Greg Kroah-Hartman --- include/linux/platform_data/mmp_dma.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/platform_data/mmp_dma.h b/include/linux/platform_data/mmp_dma.h index 2a330ec9e2af..d1397c8ed94e 100644 --- a/include/linux/platform_data/mmp_dma.h +++ b/include/linux/platform_data/mmp_dma.h @@ -14,6 +14,7 @@ struct mmp_dma_platdata { int dma_channels; + int nb_requestors; }; #endif /* MMP_DMA_H */ From c575be9a393fd88267cf42dd6af35f1a1f2a363a Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Mon, 15 Feb 2016 21:57:47 +0100 Subject: [PATCH 288/546] ARM: pxa: add the number of DMA requestor lines commit 72b195cb716284217e8b270af420bc7e5cf04b3c upstream. Declare the number of DMA requestor lines per platform : - for pxa25x: 40 requestor lines - for pxa27x: 75 requestor lines - for pxa3xx: 100 requestor lines This information will be used to activate the DMA flow control or not. Signed-off-by: Robert Jarzmik Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/pxa27x.dtsi | 1 + arch/arm/boot/dts/pxa3xx.dtsi | 1 + arch/arm/mach-pxa/devices.c | 3 ++- arch/arm/mach-pxa/pxa25x.c | 2 +- arch/arm/mach-pxa/pxa27x.c | 2 +- arch/arm/mach-pxa/pxa3xx.c | 2 +- arch/arm/plat-pxa/include/plat/dma.h | 2 +- 7 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/arm/boot/dts/pxa27x.dtsi b/arch/arm/boot/dts/pxa27x.dtsi index 7f68a1ee7073..210192c38df3 100644 --- a/arch/arm/boot/dts/pxa27x.dtsi +++ b/arch/arm/boot/dts/pxa27x.dtsi @@ -13,6 +13,7 @@ pdma: dma-controller@40000000 { interrupts = <25>; #dma-channels = <32>; #dma-cells = <2>; + #dma-requests = <75>; status = "okay"; }; diff --git a/arch/arm/boot/dts/pxa3xx.dtsi b/arch/arm/boot/dts/pxa3xx.dtsi index 564341af7e97..fec47bcd8292 100644 --- a/arch/arm/boot/dts/pxa3xx.dtsi +++ b/arch/arm/boot/dts/pxa3xx.dtsi @@ -12,6 +12,7 @@ pdma: dma-controller@40000000 { interrupts = <25>; #dma-channels = <32>; #dma-cells = <2>; + #dma-requests = <100>; status = "okay"; }; diff --git a/arch/arm/mach-pxa/devices.c b/arch/arm/mach-pxa/devices.c index 2a6e0ae2b920..a944797e9d97 100644 --- a/arch/arm/mach-pxa/devices.c +++ b/arch/arm/mach-pxa/devices.c @@ -1203,6 +1203,7 @@ void __init pxa2xx_set_spi_info(unsigned id, struct pxa2xx_spi_master *info) static struct mmp_dma_platdata pxa_dma_pdata = { .dma_channels = 0, + .nb_requestors = 0, }; static struct resource pxa_dma_resource[] = { @@ -1231,7 +1232,7 @@ static struct platform_device pxa2xx_pxa_dma = { .resource = pxa_dma_resource, }; -void __init pxa2xx_set_dmac_info(int nb_channels) +void __init pxa2xx_set_dmac_info(int nb_channels, int nb_requestors) { pxa_dma_pdata.dma_channels = nb_channels; pxa_register_device(&pxa2xx_pxa_dma, &pxa_dma_pdata); diff --git a/arch/arm/mach-pxa/pxa25x.c b/arch/arm/mach-pxa/pxa25x.c index 1dc85ffc3e20..049b9cc22720 100644 --- a/arch/arm/mach-pxa/pxa25x.c +++ b/arch/arm/mach-pxa/pxa25x.c @@ -206,7 +206,7 @@ static int __init pxa25x_init(void) register_syscore_ops(&pxa_irq_syscore_ops); register_syscore_ops(&pxa2xx_mfp_syscore_ops); - pxa2xx_set_dmac_info(16); + pxa2xx_set_dmac_info(16, 40); pxa_register_device(&pxa25x_device_gpio, &pxa25x_gpio_info); ret = platform_add_devices(pxa25x_devices, ARRAY_SIZE(pxa25x_devices)); diff --git a/arch/arm/mach-pxa/pxa27x.c b/arch/arm/mach-pxa/pxa27x.c index ffc424028557..2fb6430b7a34 100644 --- a/arch/arm/mach-pxa/pxa27x.c +++ b/arch/arm/mach-pxa/pxa27x.c @@ -309,7 +309,7 @@ static int __init pxa27x_init(void) if (!of_have_populated_dt()) { pxa_register_device(&pxa27x_device_gpio, &pxa27x_gpio_info); - pxa2xx_set_dmac_info(32); + pxa2xx_set_dmac_info(32, 75); ret = platform_add_devices(devices, ARRAY_SIZE(devices)); } diff --git a/arch/arm/mach-pxa/pxa3xx.c b/arch/arm/mach-pxa/pxa3xx.c index 20ce2d386f17..ca06f082497c 100644 --- a/arch/arm/mach-pxa/pxa3xx.c +++ b/arch/arm/mach-pxa/pxa3xx.c @@ -450,7 +450,7 @@ static int __init pxa3xx_init(void) if (of_have_populated_dt()) return 0; - pxa2xx_set_dmac_info(32); + pxa2xx_set_dmac_info(32, 100); ret = platform_add_devices(devices, ARRAY_SIZE(devices)); if (ret) return ret; diff --git a/arch/arm/plat-pxa/include/plat/dma.h b/arch/arm/plat-pxa/include/plat/dma.h index 28848b344e2d..ceba3e4184fc 100644 --- a/arch/arm/plat-pxa/include/plat/dma.h +++ b/arch/arm/plat-pxa/include/plat/dma.h @@ -95,6 +95,6 @@ static inline int pxad_toggle_reserved_channel(int legacy_channel) } #endif -extern void __init pxa2xx_set_dmac_info(int nb_channels); +extern void __init pxa2xx_set_dmac_info(int nb_channels, int nb_requestors); #endif /* __PLAT_DMA_H */ From 90df2daa1da071bd0c2766b0c5bd9abbee08e3d7 Mon Sep 17 00:00:00 2001 From: Robert Jarzmik Date: Wed, 9 Mar 2016 00:46:11 +0100 Subject: [PATCH 289/546] ARM: pxa: fix the number of DMA requestor lines commit 4c35430ad18f5a034302cb90e559ede5a27f93b9 upstream. The number of requestor lines was clamped to 0 for all pxa architectures in the requestor declaration. Fix this by using the value. Fixes: 72b195cb7162 ("ARM: pxa: add the number of DMA requestor lines") Signed-off-by: Robert Jarzmik Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-pxa/devices.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/arm/mach-pxa/devices.c b/arch/arm/mach-pxa/devices.c index a944797e9d97..614e9d8f0a54 100644 --- a/arch/arm/mach-pxa/devices.c +++ b/arch/arm/mach-pxa/devices.c @@ -1235,5 +1235,6 @@ static struct platform_device pxa2xx_pxa_dma = { void __init pxa2xx_set_dmac_info(int nb_channels, int nb_requestors) { pxa_dma_pdata.dma_channels = nb_channels; + pxa_dma_pdata.nb_requestors = nb_requestors; pxa_register_device(&pxa2xx_pxa_dma, &pxa_dma_pdata); } From 150cd84bb6ea56382cdf55043bda98b5244e41e7 Mon Sep 17 00:00:00 2001 From: Paolo Bonzini Date: Thu, 28 Sep 2017 17:58:41 +0200 Subject: [PATCH 290/546] KVM: VMX: use cmpxchg64 commit c0a1666bcb2a33e84187a15eabdcd54056be9a97 upstream. This fixes a compilation failure on 32-bit systems. Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index 67f27cc1d1b6..a018dff00808 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -2029,8 +2029,8 @@ static void vmx_vcpu_pi_load(struct kvm_vcpu *vcpu, int cpu) /* Allow posting non-urgent interrupts */ new.sn = 0; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); } /* * Switches to specified vcpu, until a matching vcpu_put(), but assumes @@ -10705,8 +10705,8 @@ static int vmx_pre_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'wakeup vector' */ new.nv = POSTED_INTR_WAKEUP_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); return 0; } @@ -10737,8 +10737,8 @@ static void vmx_post_block(struct kvm_vcpu *vcpu) /* set 'NV' to 'notification vector' */ new.nv = POSTED_INTR_VECTOR; - } while (cmpxchg(&pi_desc->control, old.control, - new.control) != old.control); + } while (cmpxchg64(&pi_desc->control, old.control, + new.control) != old.control); if(vcpu->pre_pcpu != -1) { spin_lock_irqsave( From 27323cb81eae618e68e4dea1345090c37dee5485 Mon Sep 17 00:00:00 2001 From: Vladis Dronov Date: Mon, 4 Sep 2017 16:00:50 +0200 Subject: [PATCH 291/546] video: fbdev: aty: do not leak uninitialized padding in clk to userspace commit 8e75f7a7a00461ef6d91797a60b606367f6e344d upstream. 'clk' is copied to a userland with padding byte(s) after 'vclk_post_div' field unitialized, leaking data from the stack. Fix this ensuring all of 'clk' is initialized to zero. References: https://github.com/torvalds/linux/pull/441 Reported-by: sohu0106 Signed-off-by: Vladis Dronov Signed-off-by: Bartlomiej Zolnierkiewicz Signed-off-by: Greg Kroah-Hartman --- drivers/video/fbdev/aty/atyfb_base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/video/fbdev/aty/atyfb_base.c b/drivers/video/fbdev/aty/atyfb_base.c index f34ed47fcaf8..7f658fa4d22a 100644 --- a/drivers/video/fbdev/aty/atyfb_base.c +++ b/drivers/video/fbdev/aty/atyfb_base.c @@ -1861,7 +1861,7 @@ static int atyfb_ioctl(struct fb_info *info, u_int cmd, u_long arg) #if defined(DEBUG) && defined(CONFIG_FB_ATY_CT) case ATYIO_CLKR: if (M64_HAS(INTEGRATED)) { - struct atyclk clk; + struct atyclk clk = { 0 }; union aty_pll *pll = &par->pll; u32 dsp_config = pll->ct.dsp_config; u32 dsp_on_off = pll->ct.dsp_on_off; From 079c03f4a915da1279604f88d6c07bf70427ddbc Mon Sep 17 00:00:00 2001 From: Stefano Stabellini Date: Tue, 7 Feb 2017 19:58:02 +0200 Subject: [PATCH 292/546] swiotlb-xen: implement xen_swiotlb_dma_mmap callback commit 7e91c7df29b5e196de3dc6f086c8937973bd0b88 upstream. This function creates userspace mapping for the DMA-coherent memory. Signed-off-by: Stefano Stabellini Signed-off-by: Oleksandr Dmytryshyn Signed-off-by: Andrii Anisov Signed-off-by: Konrad Rzeszutek Wilk Signed-off-by: Greg Kroah-Hartman --- arch/arm/xen/mm.c | 1 + drivers/xen/swiotlb-xen.c | 19 +++++++++++++++++++ include/xen/swiotlb-xen.h | 5 +++++ 3 files changed, 25 insertions(+) diff --git a/arch/arm/xen/mm.c b/arch/arm/xen/mm.c index c5f9a9e3d1f3..28d83f536e93 100644 --- a/arch/arm/xen/mm.c +++ b/arch/arm/xen/mm.c @@ -199,6 +199,7 @@ static struct dma_map_ops xen_swiotlb_dma_ops = { .unmap_page = xen_swiotlb_unmap_page, .dma_supported = xen_swiotlb_dma_supported, .set_dma_mask = xen_swiotlb_set_dma_mask, + .mmap = xen_swiotlb_dma_mmap, }; int __init xen_mm_init(void) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 8a58bbc14de2..622f805fb382 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -680,3 +680,22 @@ xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask) return 0; } EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); + +/* + * Create userspace mapping for the DMA-coherent memory. + * This function should be called with the pages from the current domain only, + * passing pages mapped from other domains would lead to memory corruption. + */ +int +xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs) +{ +#if defined(CONFIG_ARM) || defined(CONFIG_ARM64) + if (__generic_dma_ops(dev)->mmap) + return __generic_dma_ops(dev)->mmap(dev, vma, cpu_addr, + dma_addr, size, attrs); +#endif + return dma_common_mmap(dev, vma, cpu_addr, dma_addr, size); +} +EXPORT_SYMBOL_GPL(xen_swiotlb_dma_mmap); diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index 8b2eb93ae8ba..fab4fb9c6442 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -58,4 +58,9 @@ xen_swiotlb_dma_supported(struct device *hwdev, u64 mask); extern int xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask); + +extern int +xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, + void *cpu_addr, dma_addr_t dma_addr, size_t size, + unsigned long attrs); #endif /* __LINUX_SWIOTLB_XEN_H */ From 228969b4764fe2b0f58ef096f63666196f7b4881 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 4 Oct 2017 15:51:29 +0200 Subject: [PATCH 293/546] fix xen_swiotlb_dma_mmap prototype xen_swiotlb_dma_mmap was backported from v4.10, but older kernels before commit 00085f1efa38 ("dma-mapping: use unsigned long for dma_attrs") use a different signature: arm/xen/mm.c:202:10: error: initialization from incompatible pointer type [-Werror=incompatible-pointer-types] .mmap = xen_swiotlb_dma_mmap, ^~~~~~~~~~~~~~~~~~~~ arm/xen/mm.c:202:10: note: (near initialization for 'xen_swiotlb_dma_ops.mmap') This adapts the patch to the old calling conventions. Fixes: "swiotlb-xen: implement xen_swiotlb_dma_mmap callback" Signed-off-by: Arnd Bergmann Reviewed-by: Stefano Stabellini Signed-off-by: Greg Kroah-Hartman --- drivers/xen/swiotlb-xen.c | 2 +- include/xen/swiotlb-xen.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/xen/swiotlb-xen.c b/drivers/xen/swiotlb-xen.c index 622f805fb382..f7b19c25c3a4 100644 --- a/drivers/xen/swiotlb-xen.c +++ b/drivers/xen/swiotlb-xen.c @@ -689,7 +689,7 @@ EXPORT_SYMBOL_GPL(xen_swiotlb_set_dma_mask); int xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs) + struct dma_attrs *attrs) { #if defined(CONFIG_ARM) || defined(CONFIG_ARM64) if (__generic_dma_ops(dev)->mmap) diff --git a/include/xen/swiotlb-xen.h b/include/xen/swiotlb-xen.h index fab4fb9c6442..4d7fdbf20eff 100644 --- a/include/xen/swiotlb-xen.h +++ b/include/xen/swiotlb-xen.h @@ -62,5 +62,5 @@ xen_swiotlb_set_dma_mask(struct device *dev, u64 dma_mask); extern int xen_swiotlb_dma_mmap(struct device *dev, struct vm_area_struct *vma, void *cpu_addr, dma_addr_t dma_addr, size_t size, - unsigned long attrs); + struct dma_attrs *attrs); #endif /* __LINUX_SWIOTLB_XEN_H */ From 37c2d0d3e85014b3e92ea61668c51503965e4c24 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 5 Oct 2017 09:41:59 +0200 Subject: [PATCH 294/546] Linux 4.4.90 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index 7e4c46b375b3..ca5aaaf4aef7 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 89 +SUBLEVEL = 90 EXTRAVERSION = NAME = Blurry Fish Butt From 771dacea92cd1b6107615aede467bdf62ef8907c Mon Sep 17 00:00:00 2001 From: "Kristian H. Kristensen" Date: Tue, 13 Dec 2016 11:27:52 -0800 Subject: [PATCH 295/546] drm_fourcc: Fix DRM_FORMAT_MOD_LINEAR #define [ Upstream commit af913418261d6d3e7a29f06cf35f04610ead667c ] We need to define DRM_FORMAT_MOD_VENDOR_NONE for the fourcc_mod_code() macro to work correctly. Signed-off-by: Kristian H. Kristensen Signed-off-by: Daniel Vetter Link: http://patchwork.freedesktop.org/patch/msgid/1481657272-25975-1-git-send-email-hoegsberg@google.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/uapi/drm/drm_fourcc.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/drm/drm_fourcc.h b/include/uapi/drm/drm_fourcc.h index 0b69a7753558..f28f79966e9e 100644 --- a/include/uapi/drm/drm_fourcc.h +++ b/include/uapi/drm/drm_fourcc.h @@ -150,6 +150,7 @@ /* Vendor Ids: */ #define DRM_FORMAT_MOD_NONE 0 +#define DRM_FORMAT_MOD_VENDOR_NONE 0 #define DRM_FORMAT_MOD_VENDOR_INTEL 0x01 #define DRM_FORMAT_MOD_VENDOR_AMD 0x02 #define DRM_FORMAT_MOD_VENDOR_NV 0x03 From 11bf4a8e1d5a300b38ca4bbe1156716b0174f2da Mon Sep 17 00:00:00 2001 From: Bartosz Golaszewski Date: Tue, 13 Dec 2016 11:09:16 +0100 Subject: [PATCH 296/546] drm: bridge: add DT bindings for TI ths8135 [ Upstream commit 2e644be30fcc08c736f66b60f4898d274d4873ab ] THS8135 is a configurable video DAC. Add DT bindings for this chip. Signed-off-by: Bartosz Golaszewski Reviewed-by: Laurent Pinchart Acked-by: Rob Herring Signed-off-by: Archit Taneja Link: http://patchwork.freedesktop.org/patch/msgid/1481623759-12786-3-git-send-email-bgolaszewski@baylibre.com Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../bindings/display/bridge/ti,ths8135.txt | 46 +++++++++++++++++++ 1 file changed, 46 insertions(+) create mode 100644 Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt diff --git a/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt new file mode 100644 index 000000000000..6ec1a880ac18 --- /dev/null +++ b/Documentation/devicetree/bindings/display/bridge/ti,ths8135.txt @@ -0,0 +1,46 @@ +THS8135 Video DAC +----------------- + +This is the binding for Texas Instruments THS8135 Video DAC bridge. + +Required properties: + +- compatible: Must be "ti,ths8135" + +Required nodes: + +This device has two video ports. Their connections are modelled using the OF +graph bindings specified in Documentation/devicetree/bindings/graph.txt. + +- Video port 0 for RGB input +- Video port 1 for VGA output + +Example +------- + +vga-bridge { + compatible = "ti,ths8135"; + #address-cells = <1>; + #size-cells = <0>; + + ports { + #address-cells = <1>; + #size-cells = <0>; + + port@0 { + reg = <0>; + + vga_bridge_in: endpoint { + remote-endpoint = <&lcdc_out_vga>; + }; + }; + + port@1 { + reg = <1>; + + vga_bridge_out: endpoint { + remote-endpoint = <&vga_con_in>; + }; + }; + }; +}; From d4f97441cb888cf53b88f38ac0dc6bbe3a044910 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Wed, 14 Dec 2016 08:02:03 -0600 Subject: [PATCH 297/546] GFS2: Fix reference to ERR_PTR in gfs2_glock_iter_next [ Upstream commit 14d37564fa3dc4e5d4c6828afcd26ac14e6796c5 ] This patch fixes a place where function gfs2_glock_iter_next can reference an invalid error pointer. Signed-off-by: Dan Carpenter Signed-off-by: Bob Peterson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- fs/gfs2/glock.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index ff36f5475d7e..09a0cf5f3dd8 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -1798,16 +1798,18 @@ void gfs2_glock_exit(void) static void gfs2_glock_iter_next(struct gfs2_glock_iter *gi) { - do { - gi->gl = rhashtable_walk_next(&gi->hti); + while ((gi->gl = rhashtable_walk_next(&gi->hti))) { if (IS_ERR(gi->gl)) { if (PTR_ERR(gi->gl) == -EAGAIN) continue; gi->gl = NULL; + return; } - /* Skip entries for other sb and dead entries */ - } while ((gi->gl) && ((gi->sdp != gi->gl->gl_name.ln_sbd) || - __lockref_is_dead(&gi->gl->gl_lockref))); + /* Skip entries for other sb and dead entries */ + if (gi->sdp == gi->gl->gl_name.ln_sbd && + !__lockref_is_dead(&gi->gl->gl_lockref)) + return; + } } static void *gfs2_glock_seq_start(struct seq_file *seq, loff_t *pos) From 6a501bddeba3406517e0ba3529d9a665709df4a1 Mon Sep 17 00:00:00 2001 From: Santosh Shilimkar Date: Thu, 18 Feb 2016 20:06:47 -0800 Subject: [PATCH 298/546] RDS: RDMA: Fix the composite message user notification [ Upstream commit 941f8d55f6d613a460a5e080d25a38509f45eb75 ] When application sends an RDS RDMA composite message consist of RDMA transfer to be followed up by non RDMA payload, it expect to be notified *only* when the full message gets delivered. RDS RDMA notification doesn't behave this way though. Thanks to Venkat for debug and root casuing the issue where only first part of the message(RDMA) was successfully delivered but remainder payload delivery failed. In that case, application should not be notified with a false positive of message delivery success. Fix this case by making sure the user gets notified only after the full message delivery. Reviewed-by: Venkat Venkatsubra Signed-off-by: Santosh Shilimkar Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/rds/ib_send.c | 25 +++++++++++++++---------- net/rds/rdma.c | 10 ++++++++++ net/rds/rds.h | 1 + net/rds/send.c | 4 +++- 4 files changed, 29 insertions(+), 11 deletions(-) diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c index eac30bf486d7..094e2a12860a 100644 --- a/net/rds/ib_send.c +++ b/net/rds/ib_send.c @@ -68,16 +68,6 @@ static void rds_ib_send_complete(struct rds_message *rm, complete(rm, notify_status); } -static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, - struct rm_data_op *op, - int wc_status) -{ - if (op->op_nents) - ib_dma_unmap_sg(ic->i_cm_id->device, - op->op_sg, op->op_nents, - DMA_TO_DEVICE); -} - static void rds_ib_send_unmap_rdma(struct rds_ib_connection *ic, struct rm_rdma_op *op, int wc_status) @@ -138,6 +128,21 @@ static void rds_ib_send_unmap_atomic(struct rds_ib_connection *ic, rds_ib_stats_inc(s_ib_atomic_fadd); } +static void rds_ib_send_unmap_data(struct rds_ib_connection *ic, + struct rm_data_op *op, + int wc_status) +{ + struct rds_message *rm = container_of(op, struct rds_message, data); + + if (op->op_nents) + ib_dma_unmap_sg(ic->i_cm_id->device, + op->op_sg, op->op_nents, + DMA_TO_DEVICE); + + if (rm->rdma.op_active && rm->data.op_notify) + rds_ib_send_unmap_rdma(ic, &rm->rdma, wc_status); +} + /* * Unmap the resources associated with a struct send_work. * diff --git a/net/rds/rdma.c b/net/rds/rdma.c index 4c93badeabf2..8d3a851a3476 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -626,6 +626,16 @@ int rds_cmsg_rdma_args(struct rds_sock *rs, struct rds_message *rm, } op->op_notifier->n_user_token = args->user_token; op->op_notifier->n_status = RDS_RDMA_SUCCESS; + + /* Enable rmda notification on data operation for composite + * rds messages and make sure notification is enabled only + * for the data operation which follows it so that application + * gets notified only after full message gets delivered. + */ + if (rm->data.op_sg) { + rm->rdma.op_notify = 0; + rm->data.op_notify = !!(args->flags & RDS_RDMA_NOTIFY_ME); + } } /* The cookie contains the R_Key of the remote memory region, and diff --git a/net/rds/rds.h b/net/rds/rds.h index 0e2797bdc316..4588860f4c3b 100644 --- a/net/rds/rds.h +++ b/net/rds/rds.h @@ -378,6 +378,7 @@ struct rds_message { } rdma; struct rm_data_op { unsigned int op_active:1; + unsigned int op_notify:1; unsigned int op_nents; unsigned int op_count; unsigned int op_dmasg; diff --git a/net/rds/send.c b/net/rds/send.c index c9cdb358ea88..6815f03324d7 100644 --- a/net/rds/send.c +++ b/net/rds/send.c @@ -467,12 +467,14 @@ void rds_rdma_send_complete(struct rds_message *rm, int status) struct rm_rdma_op *ro; struct rds_notifier *notifier; unsigned long flags; + unsigned int notify = 0; spin_lock_irqsave(&rm->m_rs_lock, flags); + notify = rm->rdma.op_notify | rm->data.op_notify; ro = &rm->rdma; if (test_bit(RDS_MSG_ON_SOCK, &rm->m_flags) && - ro->op_active && ro->op_notify && ro->op_notifier) { + ro->op_active && notify && ro->op_notifier) { notifier = ro->op_notifier; rs = rm->m_rs; sock_hold(rds_rs_to_sk(rs)); From b00cfc01e70f78bfbc4df8e579e20d53688b209d Mon Sep 17 00:00:00 2001 From: Simon Horman Date: Tue, 20 Dec 2016 11:32:39 +0100 Subject: [PATCH 299/546] ARM: dts: r8a7790: Use R-Car Gen 2 fallback binding for msiof nodes [ Upstream commit 654450baf2afba86cf328e1849ccac61ec4630af ] Use recently added R-Car Gen 2 fallback binding for msiof nodes in DT for r8a7790 SoC. This has no run-time effect for the current driver as the initialisation sequence is the same for the SoC-specific binding for r8a7790 and the fallback binding for R-Car Gen 2. Signed-off-by: Simon Horman Reviewed-by: Geert Uytterhoeven Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/boot/dts/r8a7790.dtsi | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/arch/arm/boot/dts/r8a7790.dtsi b/arch/arm/boot/dts/r8a7790.dtsi index e07ae5d45e19..7b39d8fae61e 100644 --- a/arch/arm/boot/dts/r8a7790.dtsi +++ b/arch/arm/boot/dts/r8a7790.dtsi @@ -1409,7 +1409,8 @@ qspi: spi@e6b10000 { }; msiof0: spi@e6e20000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6e20000 0 0x0064>; interrupts = <0 156 IRQ_TYPE_LEVEL_HIGH>; clocks = <&mstp0_clks R8A7790_CLK_MSIOF0>; @@ -1422,7 +1423,8 @@ msiof0: spi@e6e20000 { }; msiof1: spi@e6e10000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6e10000 0 0x0064>; interrupts = <0 157 IRQ_TYPE_LEVEL_HIGH>; clocks = <&mstp2_clks R8A7790_CLK_MSIOF1>; @@ -1435,7 +1437,8 @@ msiof1: spi@e6e10000 { }; msiof2: spi@e6e00000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6e00000 0 0x0064>; interrupts = <0 158 IRQ_TYPE_LEVEL_HIGH>; clocks = <&mstp2_clks R8A7790_CLK_MSIOF2>; @@ -1448,7 +1451,8 @@ msiof2: spi@e6e00000 { }; msiof3: spi@e6c90000 { - compatible = "renesas,msiof-r8a7790"; + compatible = "renesas,msiof-r8a7790", + "renesas,rcar-gen2-msiof"; reg = <0 0xe6c90000 0 0x0064>; interrupts = <0 159 IRQ_TYPE_LEVEL_HIGH>; clocks = <&mstp2_clks R8A7790_CLK_MSIOF3>; From 1e35a2adc0782ea6ea0571d7e2220a27697adfa5 Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Mon, 7 Nov 2016 11:52:19 +0000 Subject: [PATCH 300/546] MIPS: Ensure bss section ends on a long-aligned address [ Upstream commit 3f00f4d8f083bc61005d0a1ef592b149f5c88bbd ] When clearing the .bss section in kernel_entry we do so using LONG_S instructions, and branch whilst the current write address doesn't equal the end of the .bss section minus the size of a long integer. The .bss section always begins at a long-aligned address and we always increment the write pointer by the size of a long integer - we therefore rely upon the .bss section ending at a long-aligned address. If this is not the case then the long-aligned write address can never be equal to the non-long-aligned end address & we will continue to increment past the end of the .bss section, attempting to zero the rest of memory. Despite this requirement that .bss end at a long-aligned address we pass 0 as the end alignment requirement to the BSS_SECTION macro and thus don't guarantee any particular alignment, allowing us to hit the error condition described above. Fix this by instead passing 8 bytes as the end alignment argument to the BSS_SECTION macro, ensuring that the end of the .bss section is always at least long-aligned. Signed-off-by: Paul Burton Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/14526/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/kernel/vmlinux.lds.S | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/kernel/vmlinux.lds.S b/arch/mips/kernel/vmlinux.lds.S index 0a93e83cd014..2026203c41e2 100644 --- a/arch/mips/kernel/vmlinux.lds.S +++ b/arch/mips/kernel/vmlinux.lds.S @@ -159,7 +159,7 @@ SECTIONS * Force .bss to 64K alignment so that .bss..swapper_pg_dir * gets that alignment. .sbss should be empty, so there will be * no holes after __init_end. */ - BSS_SECTION(0, 0x10000, 0) + BSS_SECTION(0, 0x10000, 8) _end = . ; From 1c3ef07eb8ebf0bedb56aeda2186a7435cc2143a Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Thu, 22 Dec 2016 23:52:58 +0000 Subject: [PATCH 301/546] MIPS: ralink: Fix incorrect assignment on ralink_soc [ Upstream commit 08d90c81b714482dceb5323d14f6617bcf55ee61 ] ralink_soc sould be assigned to RT3883_SOC, replace incorrect comparision with assignment. Signed-off-by: Colin Ian King Fixes: 418d29c87061 ("MIPS: ralink: Unify SoC id handling") Cc: John Crispin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/14903/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/ralink/rt3883.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/mips/ralink/rt3883.c b/arch/mips/ralink/rt3883.c index 3c575093f8f1..f2a6e1b8cce0 100644 --- a/arch/mips/ralink/rt3883.c +++ b/arch/mips/ralink/rt3883.c @@ -144,5 +144,5 @@ void prom_soc_init(struct ralink_soc_info *soc_info) rt2880_pinmux_data = rt3883_pinmux_data; - ralink_soc == RT3883_SOC; + ralink_soc = RT3883_SOC; } From 5603b10236da437378f6529dba3654b9c280eb15 Mon Sep 17 00:00:00 2001 From: Guilherme G Piccoli Date: Thu, 10 Nov 2016 16:46:43 -0200 Subject: [PATCH 302/546] igb: re-assign hw address pointer on reset after PCI error [ Upstream commit 69b97cf6dbce7403845a28bbc75d57f5be7b12ac ] Whenever the igb driver detects the result of a read operation returns a value composed only by F's (like 0xFFFFFFFF), it will detach the net_device, clear the hw_addr pointer and warn to the user that adapter's link is lost - those steps happen on igb_rd32(). In case a PCI error happens on Power architecture, there's a recovery mechanism called EEH, that will reset the PCI slot and call driver's handlers to reset the adapter and network functionality as well. We observed that once hw_addr is NULL after the error is detected on igb_rd32(), it's never assigned back, so in the process of resetting the network functionality we got a NULL pointer dereference in both igb_configure_tx_ring() and igb_configure_rx_ring(). In order to avoid such bug, this patch re-assigns the hw_addr value in the slot_reset handler. Reported-by: Anthony H Thai Reported-by: Harsha Thyagaraja Signed-off-by: Guilherme G Piccoli Tested-by: Aaron Brown Signed-off-by: Jeff Kirsher Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/intel/igb/igb_main.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index fa3b4cbea23b..a481ea64e287 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -7658,6 +7658,11 @@ static pci_ers_result_t igb_io_slot_reset(struct pci_dev *pdev) pci_enable_wake(pdev, PCI_D3hot, 0); pci_enable_wake(pdev, PCI_D3cold, 0); + /* In case of PCI error, adapter lose its HW address + * so we should re-assign it here. + */ + hw->hw_addr = adapter->io_addr; + igb_reset(adapter); wr32(E1000_WUS, ~0); result = PCI_ERS_RESULT_RECOVERED; From effdf2b134d575f1da5c02a5c821610b153f5cab Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Mon, 19 Dec 2016 01:13:11 +0100 Subject: [PATCH 303/546] extcon: axp288: Use vbus-valid instead of -present to determine cable presence [ Upstream commit 5757aca10146061befd168dab37fb0db1ccd8f73 ] The vbus-present bit in the power status register also gets set to 1 when a usb-host cable (id-pin shorted to ground) is plugged in and a 5v boost converter is supplying 5v to the otg usb bus. This causes a "disconnect or unknown or ID event" warning in dmesg as well as the extcon device to report the last detected charger cable type as being connected even though none is connected. This commit switches to checking the vbus-valid bit instead, which is only 1 when both vbus is present and the vbus-path is enabled in the vbus-path control register (the vbus-path gets disabled when a usb-host cable is detected, to avoid the pmic drawing power from the 5v boost converter). Signed-off-by: Hans de Goede Acked-by: Chanwoo Choi Signed-off-by: Chanwoo Choi Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/extcon/extcon-axp288.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/extcon/extcon-axp288.c b/drivers/extcon/extcon-axp288.c index fd55c2f2080a..6c9d7ccebb8c 100644 --- a/drivers/extcon/extcon-axp288.c +++ b/drivers/extcon/extcon-axp288.c @@ -168,7 +168,7 @@ static int axp288_handle_chrg_det_event(struct axp288_extcon_info *info) return ret; } - vbus_attach = (pwr_stat & PS_STAT_VBUS_PRESENT); + vbus_attach = (pwr_stat & PS_STAT_VBUS_VALID); if (!vbus_attach) goto notify_otg; From d89f41c20f32cd276e716569b60a5eac0ac64ad9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Niklas=20S=C3=B6derlund?= Date: Mon, 9 Jan 2017 16:34:04 +0100 Subject: [PATCH 304/546] sh_eth: use correct name for ECMR_MPDE bit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6dcf45e514974a1ff10755015b5e06746a033e5f ] This bit was wrongly named due to a typo, Sergei checked the SH7734/63 manuals and this bit should be named MPDE. Suggested-by: Sergei Shtylyov Signed-off-by: Niklas Söderlund Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/ethernet/renesas/sh_eth.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/renesas/sh_eth.h b/drivers/net/ethernet/renesas/sh_eth.h index 72fcfc924589..0d18be0fed8e 100644 --- a/drivers/net/ethernet/renesas/sh_eth.h +++ b/drivers/net/ethernet/renesas/sh_eth.h @@ -339,7 +339,7 @@ enum FELIC_MODE_BIT { ECMR_DPAD = 0x00200000, ECMR_RZPF = 0x00100000, ECMR_ZPF = 0x00080000, ECMR_PFR = 0x00040000, ECMR_RXF = 0x00020000, ECMR_TXF = 0x00010000, ECMR_MCT = 0x00002000, ECMR_PRCEF = 0x00001000, - ECMR_PMDE = 0x00000200, ECMR_RE = 0x00000040, ECMR_TE = 0x00000020, + ECMR_MPDE = 0x00000200, ECMR_RE = 0x00000040, ECMR_TE = 0x00000020, ECMR_RTM = 0x00000010, ECMR_ILB = 0x00000008, ECMR_ELB = 0x00000004, ECMR_DM = 0x00000002, ECMR_PRM = 0x00000001, }; From 297b8b01ec278ceadf55cc7f243797700a250a1c Mon Sep 17 00:00:00 2001 From: Guenter Roeck Date: Tue, 27 Dec 2016 14:15:07 -0800 Subject: [PATCH 305/546] hwmon: (gl520sm) Fix overflows and crash seen when writing into limit attributes [ Upstream commit 87cdfa9d60f4f40e6d71b04b10b36d9df3c89282 ] Writes into limit attributes can overflow due to multplications and additions with unbound input values. Writing into fan limit attributes can result in a crash with a division by zero if very large values are written and the fan divider is larger than 1. Signed-off-by: Guenter Roeck Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/hwmon/gl520sm.c | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/drivers/hwmon/gl520sm.c b/drivers/hwmon/gl520sm.c index dee93ec87d02..84e0994aafdd 100644 --- a/drivers/hwmon/gl520sm.c +++ b/drivers/hwmon/gl520sm.c @@ -208,11 +208,13 @@ static ssize_t get_cpu_vid(struct device *dev, struct device_attribute *attr, } static DEVICE_ATTR(cpu0_vid, S_IRUGO, get_cpu_vid, NULL); -#define VDD_FROM_REG(val) (((val) * 95 + 2) / 4) -#define VDD_TO_REG(val) clamp_val((((val) * 4 + 47) / 95), 0, 255) +#define VDD_FROM_REG(val) DIV_ROUND_CLOSEST((val) * 95, 4) +#define VDD_CLAMP(val) clamp_val(val, 0, 255 * 95 / 4) +#define VDD_TO_REG(val) DIV_ROUND_CLOSEST(VDD_CLAMP(val) * 4, 95) -#define IN_FROM_REG(val) ((val) * 19) -#define IN_TO_REG(val) clamp_val((((val) + 9) / 19), 0, 255) +#define IN_FROM_REG(val) ((val) * 19) +#define IN_CLAMP(val) clamp_val(val, 0, 255 * 19) +#define IN_TO_REG(val) DIV_ROUND_CLOSEST(IN_CLAMP(val), 19) static ssize_t get_in_input(struct device *dev, struct device_attribute *attr, char *buf) @@ -349,8 +351,13 @@ static SENSOR_DEVICE_ATTR(in4_max, S_IRUGO | S_IWUSR, #define DIV_FROM_REG(val) (1 << (val)) #define FAN_FROM_REG(val, div) ((val) == 0 ? 0 : (480000 / ((val) << (div)))) -#define FAN_TO_REG(val, div) ((val) <= 0 ? 0 : \ - clamp_val((480000 + ((val) << ((div)-1))) / ((val) << (div)), 1, 255)) + +#define FAN_BASE(div) (480000 >> (div)) +#define FAN_CLAMP(val, div) clamp_val(val, FAN_BASE(div) / 255, \ + FAN_BASE(div)) +#define FAN_TO_REG(val, div) ((val) == 0 ? 0 : \ + DIV_ROUND_CLOSEST(480000, \ + FAN_CLAMP(val, div) << (div))) static ssize_t get_fan_input(struct device *dev, struct device_attribute *attr, char *buf) @@ -513,9 +520,9 @@ static SENSOR_DEVICE_ATTR(fan2_div, S_IRUGO | S_IWUSR, static DEVICE_ATTR(fan1_off, S_IRUGO | S_IWUSR, get_fan_off, set_fan_off); -#define TEMP_FROM_REG(val) (((val) - 130) * 1000) -#define TEMP_TO_REG(val) clamp_val(((((val) < 0 ? \ - (val) - 500 : (val) + 500) / 1000) + 130), 0, 255) +#define TEMP_FROM_REG(val) (((val) - 130) * 1000) +#define TEMP_CLAMP(val) clamp_val(val, -130000, 125000) +#define TEMP_TO_REG(val) (DIV_ROUND_CLOSEST(TEMP_CLAMP(val), 1000) + 130) static ssize_t get_temp_input(struct device *dev, struct device_attribute *attr, char *buf) From a1f7b8ff496db893c6dfb6a1fdc2b23208e6de94 Mon Sep 17 00:00:00 2001 From: Hans de Goede Date: Wed, 14 Dec 2016 14:55:25 +0100 Subject: [PATCH 306/546] iio: adc: axp288: Drop bogus AXP288_ADC_TS_PIN_CTRL register modifications [ Upstream commit fa2849e9649b5180ffc4cb3c3b005261c403093a ] For some reason the axp288_adc driver was modifying the AXP288_ADC_TS_PIN_CTRL register, changing bits 0-1 depending on whether the GP_ADC channel or another channel was written. These bits control when a bias current is send to the TS_PIN, the GP_ADC has its own pin and a separate bit in another register to control the bias current. Not only does changing when to enable the TS_PIN bias current (always or only when sampling) when reading the GP_ADC make no sense at all, the code is modifying these bits is writing the entire register, assuming that all the other bits have their default value. So if the firmware has configured a different bias-current for either pin, then that change gets clobbered by the write, likewise if the firmware has set bit 2 to indicate that the battery has no thermal sensor, this will get clobbered by the write. This commit fixes all this, by simply removing all writes to the AXP288_ADC_TS_PIN_CTRL register, they are not needed to read the GP_ADC pin, and can actually be harmful. Signed-off-by: Hans de Goede Acked-by: Chen-Yu Tsai Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/axp288_adc.c | 32 +------------------------------- 1 file changed, 1 insertion(+), 31 deletions(-) diff --git a/drivers/iio/adc/axp288_adc.c b/drivers/iio/adc/axp288_adc.c index 0c904edd6c00..f684fe31f832 100644 --- a/drivers/iio/adc/axp288_adc.c +++ b/drivers/iio/adc/axp288_adc.c @@ -28,8 +28,6 @@ #include #define AXP288_ADC_EN_MASK 0xF1 -#define AXP288_ADC_TS_PIN_GPADC 0xF2 -#define AXP288_ADC_TS_PIN_ON 0xF3 enum axp288_adc_id { AXP288_ADC_TS, @@ -123,16 +121,6 @@ static int axp288_adc_read_channel(int *val, unsigned long address, return IIO_VAL_INT; } -static int axp288_adc_set_ts(struct regmap *regmap, unsigned int mode, - unsigned long address) -{ - /* channels other than GPADC do not need to switch TS pin */ - if (address != AXP288_GP_ADC_H) - return 0; - - return regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, mode); -} - static int axp288_adc_read_raw(struct iio_dev *indio_dev, struct iio_chan_spec const *chan, int *val, int *val2, long mask) @@ -143,16 +131,7 @@ static int axp288_adc_read_raw(struct iio_dev *indio_dev, mutex_lock(&indio_dev->mlock); switch (mask) { case IIO_CHAN_INFO_RAW: - if (axp288_adc_set_ts(info->regmap, AXP288_ADC_TS_PIN_GPADC, - chan->address)) { - dev_err(&indio_dev->dev, "GPADC mode\n"); - ret = -EINVAL; - break; - } ret = axp288_adc_read_channel(val, chan->address, info->regmap); - if (axp288_adc_set_ts(info->regmap, AXP288_ADC_TS_PIN_ON, - chan->address)) - dev_err(&indio_dev->dev, "TS pin restore\n"); break; default: ret = -EINVAL; @@ -162,15 +141,6 @@ static int axp288_adc_read_raw(struct iio_dev *indio_dev, return ret; } -static int axp288_adc_set_state(struct regmap *regmap) -{ - /* ADC should be always enabled for internal FG to function */ - if (regmap_write(regmap, AXP288_ADC_TS_PIN_CTRL, AXP288_ADC_TS_PIN_ON)) - return -EIO; - - return regmap_write(regmap, AXP20X_ADC_EN1, AXP288_ADC_EN_MASK); -} - static const struct iio_info axp288_adc_iio_info = { .read_raw = &axp288_adc_read_raw, .driver_module = THIS_MODULE, @@ -199,7 +169,7 @@ static int axp288_adc_probe(struct platform_device *pdev) * Set ADC to enabled state at all time, including system suspend. * otherwise internal fuel gauge functionality may be affected. */ - ret = axp288_adc_set_state(axp20x->regmap); + ret = regmap_write(info->regmap, AXP20X_ADC_EN1, AXP288_ADC_EN_MASK); if (ret) { dev_err(&pdev->dev, "unable to enable ADC device\n"); return ret; From 89642710fdb3f41e8c0e44901f695d2c48b8ffb7 Mon Sep 17 00:00:00 2001 From: Andreas Klinger Date: Thu, 5 Jan 2017 18:51:36 +0100 Subject: [PATCH 307/546] iio: adc: hx711: Add DT binding for avia,hx711 [ Upstream commit ff1293f67734da68e23fecb6ecdae7112b8c43f9 ] Add DT bindings for avia,hx711 Add vendor avia to vendor list Signed-off-by: Andreas Klinger Acked-by: Rob Herring Signed-off-by: Jonathan Cameron Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- .../devicetree/bindings/iio/adc/avia-hx711.txt | 18 ++++++++++++++++++ .../devicetree/bindings/vendor-prefixes.txt | 1 + 2 files changed, 19 insertions(+) create mode 100644 Documentation/devicetree/bindings/iio/adc/avia-hx711.txt diff --git a/Documentation/devicetree/bindings/iio/adc/avia-hx711.txt b/Documentation/devicetree/bindings/iio/adc/avia-hx711.txt new file mode 100644 index 000000000000..b3629405f568 --- /dev/null +++ b/Documentation/devicetree/bindings/iio/adc/avia-hx711.txt @@ -0,0 +1,18 @@ +* AVIA HX711 ADC chip for weight cells + Bit-banging driver + +Required properties: + - compatible: Should be "avia,hx711" + - sck-gpios: Definition of the GPIO for the clock + - dout-gpios: Definition of the GPIO for data-out + See Documentation/devicetree/bindings/gpio/gpio.txt + - avdd-supply: Definition of the regulator used as analog supply + +Example: +weight@0 { + compatible = "avia,hx711"; + sck-gpios = <&gpio3 10 GPIO_ACTIVE_HIGH>; + dout-gpios = <&gpio0 7 GPIO_ACTIVE_HIGH>; + avdd-suppy = <&avdd>; +}; + diff --git a/Documentation/devicetree/bindings/vendor-prefixes.txt b/Documentation/devicetree/bindings/vendor-prefixes.txt index 55df1d444e9f..98dc17507a84 100644 --- a/Documentation/devicetree/bindings/vendor-prefixes.txt +++ b/Documentation/devicetree/bindings/vendor-prefixes.txt @@ -31,6 +31,7 @@ asahi-kasei Asahi Kasei Corp. atmel Atmel Corporation auo AU Optronics Corporation avago Avago Technologies +avia avia semiconductor avic Shanghai AVIC Optoelectronics Co., Ltd. axis Axis Communications AB bosch Bosch Sensortec GmbH From f97c79e83f7e01ff4e310f0fc4cb41a992ccc5ed Mon Sep 17 00:00:00 2001 From: Afzal Mohammed Date: Sat, 7 Jan 2017 17:48:10 +0100 Subject: [PATCH 308/546] ARM: 8635/1: nommu: allow enabling REMAP_VECTORS_TO_RAM [ Upstream commit 8a792e9afbce84a0fdaf213fe42bb97382487094 ] REMAP_VECTORS_TO_RAM depends on DRAM_BASE, but since DRAM_BASE is a hex, REMAP_VECTORS_TO_RAM could never get enabled. Also depending on DRAM_BASE is redundant as whenever REMAP_VECTORS_TO_RAM makes itself available to Kconfig, DRAM_BASE also is available as the Kconfig gets sourced on !MMU. Signed-off-by: Afzal Mohammed Reviewed-by: Vladimir Murzin Signed-off-by: Russell King Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/arm/Kconfig-nommu | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/arch/arm/Kconfig-nommu b/arch/arm/Kconfig-nommu index aed66d5df7f1..b7576349528c 100644 --- a/arch/arm/Kconfig-nommu +++ b/arch/arm/Kconfig-nommu @@ -34,8 +34,7 @@ config PROCESSOR_ID used instead of the auto-probing which utilizes the register. config REMAP_VECTORS_TO_RAM - bool 'Install vectors to the beginning of RAM' if DRAM_BASE - depends on DRAM_BASE + bool 'Install vectors to the beginning of RAM' help The kernel needs to change the hardware exception vectors. In nommu mode, the hardware exception vectors are normally From 01b3db29ba1edd79b9c3e7cd294739b70257d278 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Mon, 9 Jan 2017 01:26:37 +0100 Subject: [PATCH 309/546] tty: goldfish: Fix a parameter of a call to free_irq [ Upstream commit 1a5c2d1de7d35f5eb9793266237903348989502b ] 'request_irq()' and 'free_irq()' should be called with the same dev_id. Signed-off-by: Christophe JAILLET Signed-off-by: Greg Kroah-Hartman Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/tty/goldfish.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/tty/goldfish.c b/drivers/tty/goldfish.c index 0f82c0b146f6..e04b57f79df8 100644 --- a/drivers/tty/goldfish.c +++ b/drivers/tty/goldfish.c @@ -293,7 +293,7 @@ static int goldfish_tty_probe(struct platform_device *pdev) return 0; err_tty_register_device_failed: - free_irq(irq, pdev); + free_irq(irq, qtty); err_request_irq_failed: goldfish_tty_current_line_count--; if (goldfish_tty_current_line_count == 0) From 9326a1374b13c2b50346c2868aabfedbd71f678d Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 28 Dec 2016 14:47:22 +0200 Subject: [PATCH 310/546] IB/ipoib: Fix deadlock over vlan_mutex [ Upstream commit 1c3098cdb05207e740715857df7b0998e372f527 ] This patch fixes Deadlock while executing ipoib_vlan_delete. The function takes the vlan_rwsem semaphore and calls unregister_netdevice. The later function calls ipoib_mcast_stop_thread that cause workqueue flush. When the queue has one of the ipoib_ib_dev_flush_xxx events, a deadlock occur because these events also tries to catch the same vlan_rwsem semaphore. To fix, unregister_netdevice should be called after releasing the semaphore. Fixes: cbbe1efa4972 ("IPoIB: Fix deadlock between ipoib_open() and child interface create") Signed-off-by: Feras Daoud Signed-off-by: Erez Shitrit Reviewed-by: Alex Vesker Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 57a34f87dedf..3a647fd50f09 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -185,7 +185,6 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) list_for_each_entry_safe(priv, tpriv, &ppriv->child_intfs, list) { if (priv->pkey == pkey && priv->child_type == IPOIB_LEGACY_CHILD) { - unregister_netdevice(priv->dev); list_del(&priv->list); dev = priv->dev; break; @@ -193,6 +192,11 @@ int ipoib_vlan_delete(struct net_device *pdev, unsigned short pkey) } up_write(&ppriv->vlan_rwsem); + if (dev) { + ipoib_dbg(ppriv, "delete child vlan %s\n", dev->name); + unregister_netdevice(dev); + } + rtnl_unlock(); if (dev) { From f1d53c6d484336229f81ef10fb2ce069abe7c678 Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 28 Dec 2016 14:47:24 +0200 Subject: [PATCH 311/546] IB/ipoib: rtnl_unlock can not come after free_netdev [ Upstream commit 89a3987ab7a923c047c6dec008e60ad6f41fac22 ] The ipoib_vlan_add function calls rtnl_unlock after free_netdev, rtnl_unlock not only releases the lock, but also calls netdev_run_todo. The latter function browses the net_todo_list array and completes the unregistration of all its net_device instances. If we call free_netdev before rtnl_unlock, then netdev_run_todo call over the freed device causes panic. To fix, move rtnl_unlock call before free_netdev call. Fixes: 9baa0b036410 ("IB/ipoib: Add rtnl_link_ops support") Cc: Or Gerlitz Signed-off-by: Feras Daoud Signed-off-by: Erez Shitrit Reviewed-by: Yuval Shaia Signed-off-by: Leon Romanovsky Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_vlan.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c index 3a647fd50f09..9b47a437d6c9 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_vlan.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_vlan.c @@ -160,11 +160,11 @@ int ipoib_vlan_add(struct net_device *pdev, unsigned short pkey) out: up_write(&ppriv->vlan_rwsem); + rtnl_unlock(); + if (result) free_netdev(priv->dev); - rtnl_unlock(); - return result; } From bf184ddd2180724aa0a2293415d051dc856e22ee Mon Sep 17 00:00:00 2001 From: Feras Daoud Date: Wed, 28 Dec 2016 14:47:27 +0200 Subject: [PATCH 312/546] IB/ipoib: Replace list_del of the neigh->list with list_del_init [ Upstream commit c586071d1dc8227a7182179b8e50ee92cc43f6d2 ] In order to resolve a situation where a few process delete the same list element in sequence and cause panic, list_del is replaced with list_del_init. In this case if the first process that calls list_del releases the lock before acquiring it again, other processes who can acquire the lock will call list_del_init. Fixes: b63b70d87741 ("IPoIB: Use a private hash table for path lookup") Signed-off-by: Feras Daoud Signed-off-by: Erez Shitrit Reviewed-by: Alex Vesker Signed-off-by: Leon Romanovsky Reviewed-by: Yuval Shaia Signed-off-by: Doug Ledford Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/ulp/ipoib/ipoib_main.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 6699ecd855f0..bad76eed06b3 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1239,7 +1239,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -1406,7 +1406,7 @@ void ipoib_neigh_free(struct ipoib_neigh *neigh) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); return; } else { @@ -1491,7 +1491,7 @@ void ipoib_del_neighs_by_gid(struct net_device *dev, u8 *gid) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from parent list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } else { np = &neigh->hnext; @@ -1533,7 +1533,7 @@ static void ipoib_flush_neighs(struct ipoib_dev_priv *priv) rcu_dereference_protected(neigh->hnext, lockdep_is_held(&priv->lock))); /* remove from path/mc list */ - list_del(&neigh->list); + list_del_init(&neigh->list); call_rcu(&neigh->rcu, ipoib_neigh_reclaim); } } From 8b2522eb44aed2b81d0958000a18f2ba8fb59ac3 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Thu, 1 Dec 2016 16:10:42 +0800 Subject: [PATCH 313/546] drm/amdkfd: fix improper return value on error [ Upstream commit 8bf793883da213864efc50c274d2b38ec0ca58b2 ] In function kfd_wait_on_events(), when the call to copy_from_user() fails, the value of return variable ret is 0. 0 indicates success, which is inconsistent with the execution status. This patch fixes the bug by assigning "-EFAULT" to ret when copy_from_user() returns an unexpected value. Signed-off-by: Pan Bian Signed-off-by: Oded Gabbay Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/amd/amdkfd/kfd_events.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_events.c b/drivers/gpu/drm/amd/amdkfd/kfd_events.c index b6e28dcaea1d..1fb1daa0b366 100644 --- a/drivers/gpu/drm/amd/amdkfd/kfd_events.c +++ b/drivers/gpu/drm/amd/amdkfd/kfd_events.c @@ -739,8 +739,10 @@ int kfd_wait_on_events(struct kfd_process *p, struct kfd_event_data event_data; if (copy_from_user(&event_data, &events[i], - sizeof(struct kfd_event_data))) + sizeof(struct kfd_event_data))) { + ret = -EFAULT; goto fail; + } ret = init_event_waiter(p, &event_waiters[i], event_data.event_id, i); From 0d1b459a0baf87c8cd5ec8d8e1acb7d15fb60b9f Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 12 Jan 2017 14:56:17 +0100 Subject: [PATCH 314/546] USB: serial: mos7720: fix control-message error handling [ Upstream commit 0d130367abf582e7cbf60075c2a7ab53817b1d14 ] Make sure to log an error on short transfers when reading a device register. Also clear the provided buffer (which if often an uninitialised automatic variable) on errors as the driver currently does not bother to check for errors. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7720.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/drivers/usb/serial/mos7720.c b/drivers/usb/serial/mos7720.c index e56cdb436de3..4581fa1dec98 100644 --- a/drivers/usb/serial/mos7720.c +++ b/drivers/usb/serial/mos7720.c @@ -234,11 +234,16 @@ static int read_mos_reg(struct usb_serial *serial, unsigned int serial_portnum, status = usb_control_msg(usbdev, pipe, request, requesttype, value, index, buf, 1, MOS_WDR_TIMEOUT); - if (status == 1) + if (status == 1) { *data = *buf; - else if (status < 0) + } else { dev_err(&usbdev->dev, "mos7720: usb_control_msg() failed: %d\n", status); + if (status >= 0) + status = -EIO; + *data = 0; + } + kfree(buf); return status; From abbccd85575319472e468b009fc0816cdab7a795 Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Thu, 12 Jan 2017 14:56:18 +0100 Subject: [PATCH 315/546] USB: serial: mos7840: fix control-message error handling [ Upstream commit cd8db057e93ddaacbec025b567490555d2bca280 ] Make sure to detect short transfers when reading a device register. The modem-status handling had sufficient error checks in place, but move handling of short transfers into the register accessor function itself for consistency. Reviewed-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/mos7840.c | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/drivers/usb/serial/mos7840.c b/drivers/usb/serial/mos7840.c index d17685cc00c9..ed883a7ad533 100644 --- a/drivers/usb/serial/mos7840.c +++ b/drivers/usb/serial/mos7840.c @@ -285,9 +285,15 @@ static int mos7840_get_reg_sync(struct usb_serial_port *port, __u16 reg, ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, MCS_RD_RTYPE, 0, reg, buf, VENDOR_READ_LENGTH, MOS_WDR_TIMEOUT); + if (ret < VENDOR_READ_LENGTH) { + if (ret >= 0) + ret = -EIO; + goto out; + } + *val = buf[0]; dev_dbg(&port->dev, "%s offset is %x, return val %x\n", __func__, reg, *val); - +out: kfree(buf); return ret; } @@ -353,8 +359,13 @@ static int mos7840_get_uart_reg(struct usb_serial_port *port, __u16 reg, ret = usb_control_msg(dev, usb_rcvctrlpipe(dev, 0), MCS_RDREQ, MCS_RD_RTYPE, Wval, reg, buf, VENDOR_READ_LENGTH, MOS_WDR_TIMEOUT); + if (ret < VENDOR_READ_LENGTH) { + if (ret >= 0) + ret = -EIO; + goto out; + } *val = buf[0]; - +out: kfree(buf); return ret; } @@ -1490,10 +1501,10 @@ static int mos7840_tiocmget(struct tty_struct *tty) return -ENODEV; status = mos7840_get_uart_reg(port, MODEM_STATUS_REGISTER, &msr); - if (status != 1) + if (status < 0) return -EIO; status = mos7840_get_uart_reg(port, MODEM_CONTROL_REGISTER, &mcr); - if (status != 1) + if (status < 0) return -EIO; result = ((mcr & MCR_DTR) ? TIOCM_DTR : 0) | ((mcr & MCR_RTS) ? TIOCM_RTS : 0) From 8e8c3d4bb62950c37f086be7d3d775b4879c30df Mon Sep 17 00:00:00 2001 From: Alden Tondettar Date: Sun, 15 Jan 2017 15:31:56 -0700 Subject: [PATCH 316/546] partitions/efi: Fix integer overflow in GPT size calculation [ Upstream commit c5082b70adfe8e1ea1cf4a8eff92c9f260e364d2 ] If a GUID Partition Table claims to have more than 2**25 entries, the calculation of the partition table size in alloc_read_gpt_entries() will overflow a 32-bit integer and not enough space will be allocated for the table. Nothing seems to get written out of bounds, but later efi_partition() will read up to 32768 bytes from a 128 byte buffer, possibly OOPSing or exposing information to /proc/partitions and uevents. The problem exists on both 64-bit and 32-bit platforms. Fix the overflow and also print a meaningful debug message if the table size is too large. Signed-off-by: Alden Tondettar Acked-by: Ard Biesheuvel Signed-off-by: Jens Axboe Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- block/partitions/efi.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/block/partitions/efi.c b/block/partitions/efi.c index 26cb624ace05..d26d0d27f5fd 100644 --- a/block/partitions/efi.c +++ b/block/partitions/efi.c @@ -293,7 +293,7 @@ static gpt_entry *alloc_read_gpt_entries(struct parsed_partitions *state, if (!gpt) return NULL; - count = le32_to_cpu(gpt->num_partition_entries) * + count = (size_t)le32_to_cpu(gpt->num_partition_entries) * le32_to_cpu(gpt->sizeof_partition_entry); if (!count) return NULL; @@ -352,7 +352,7 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, gpt_header **gpt, gpt_entry **ptes) { u32 crc, origcrc; - u64 lastlba; + u64 lastlba, pt_size; if (!ptes) return 0; @@ -434,13 +434,20 @@ static int is_gpt_valid(struct parsed_partitions *state, u64 lba, goto fail; } + /* Sanity check partition table size */ + pt_size = (u64)le32_to_cpu((*gpt)->num_partition_entries) * + le32_to_cpu((*gpt)->sizeof_partition_entry); + if (pt_size > KMALLOC_MAX_SIZE) { + pr_debug("GUID Partition Table is too large: %llu > %lu bytes\n", + (unsigned long long)pt_size, KMALLOC_MAX_SIZE); + goto fail; + } + if (!(*ptes = alloc_read_gpt_entries(state, *gpt))) goto fail; /* Check the GUID Partition Entry Array CRC */ - crc = efi_crc32((const unsigned char *) (*ptes), - le32_to_cpu((*gpt)->num_partition_entries) * - le32_to_cpu((*gpt)->sizeof_partition_entry)); + crc = efi_crc32((const unsigned char *) (*ptes), pt_size); if (crc != le32_to_cpu((*gpt)->partition_entry_array_crc32)) { pr_debug("GUID Partitition Entry Array CRC check failed.\n"); From af3749456042cc38c80902e849421451b27215f6 Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 13 Jan 2017 10:23:52 +0100 Subject: [PATCH 317/546] ASoC: dapm: handle probe deferrals [ Upstream commit 37e1df8c95e2c8a57c77eafc097648f6e40a60ff ] This starts to handle probe deferrals on regulators and clocks on the ASoC DAPM. I came to this patch after audio stopped working on Ux500 ages ago and I finally looked into it to see what is wrong. I had messages like this in the console since a while back: ab8500-codec.0: ASoC: Failed to request audioclk: -517 ab8500-codec.0: ASoC: Failed to create DAPM control audioclk ab8500-codec.0: Failed to create new controls -12 snd-soc-mop500.0: ASoC: failed to instantiate card -12 snd-soc-mop500.0: Error: snd_soc_register_card failed (-12)! snd-soc-mop500: probe of snd-soc-mop500.0 failed with error -12 Apparently because the widget table for the codec looks like this (sound/soc/codecs/ab8500-codec.c): static const struct snd_soc_dapm_widget ab8500_dapm_widgets[] = { /* Clocks */ SND_SOC_DAPM_CLOCK_SUPPLY("audioclk"), /* Regulators */ SND_SOC_DAPM_REGULATOR_SUPPLY("V-AUD", 0, 0), SND_SOC_DAPM_REGULATOR_SUPPLY("V-AMIC1", 0, 0), SND_SOC_DAPM_REGULATOR_SUPPLY("V-AMIC2", 0, 0), SND_SOC_DAPM_REGULATOR_SUPPLY("V-DMIC", 0, 0), So when we call snd_soc_register_codec() and any of these widgets get a deferred probe we do not get an -EPROBE_DEFER (-517) back as we should and instead we just fail. Apparently the code assumes that clocks and regulators must be available at this point and not defer. After this patch it rather looks like this: ab8500-codec.0: Failed to create new controls -517 snd-soc-mop500.0: ASoC: failed to instantiate card -517 snd-soc-mop500.0: Error: snd_soc_register_card failed (-517)! (...) abx500-clk.0: registered clocks for ab850x snd-soc-mop500.0: ab8500-codec-dai.0 <-> ux500-msp-i2s.1 mapping ok snd-soc-mop500.0: ab8500-codec-dai.1 <-> ux500-msp-i2s.3 mapping ok I'm pretty happy about the patch as it it, but I'm a bit uncertain on how to proceed: there are a lot of users of the external functions snd_soc_dapm_new_control() (111 sites) and that will now return an occassional error pointer, which is not handled in the calling sites. I want an indication from the maintainers whether I should just go in and augment all these call sites, or if deferred probe is frowned upon when it leads to this much overhead. Signed-off-by: Linus Walleij Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-dapm.c | 42 ++++++++++++++++++++++++++++++++++++++++ sound/soc/soc-topology.c | 9 +++++++++ 2 files changed, 51 insertions(+) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index b8a256dfed7e..df036afb2197 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -358,6 +358,10 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget, snd_soc_dapm_new_control_unlocked(widget->dapm, &template); kfree(name); + if (IS_ERR(data->widget)) { + ret = PTR_ERR(data->widget); + goto err_data; + } if (!data->widget) { ret = -ENOMEM; goto err_data; @@ -392,6 +396,10 @@ static int dapm_kcontrol_data_alloc(struct snd_soc_dapm_widget *widget, data->widget = snd_soc_dapm_new_control_unlocked( widget->dapm, &template); kfree(name); + if (IS_ERR(data->widget)) { + ret = PTR_ERR(data->widget); + goto err_data; + } if (!data->widget) { ret = -ENOMEM; goto err_data; @@ -3278,11 +3286,22 @@ snd_soc_dapm_new_control(struct snd_soc_dapm_context *dapm, mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_RUNTIME); w = snd_soc_dapm_new_control_unlocked(dapm, widget); + /* Do not nag about probe deferrals */ + if (IS_ERR(w)) { + int ret = PTR_ERR(w); + + if (ret != -EPROBE_DEFER) + dev_err(dapm->dev, + "ASoC: Failed to create DAPM control %s (%d)\n", + widget->name, ret); + goto out_unlock; + } if (!w) dev_err(dapm->dev, "ASoC: Failed to create DAPM control %s\n", widget->name); +out_unlock: mutex_unlock(&dapm->card->dapm_mutex); return w; } @@ -3304,6 +3323,8 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, w->regulator = devm_regulator_get(dapm->dev, w->name); if (IS_ERR(w->regulator)) { ret = PTR_ERR(w->regulator); + if (ret == -EPROBE_DEFER) + return ERR_PTR(ret); dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", w->name, ret); return NULL; @@ -3322,6 +3343,8 @@ snd_soc_dapm_new_control_unlocked(struct snd_soc_dapm_context *dapm, w->clk = devm_clk_get(dapm->dev, w->name); if (IS_ERR(w->clk)) { ret = PTR_ERR(w->clk); + if (ret == -EPROBE_DEFER) + return ERR_PTR(ret); dev_err(dapm->dev, "ASoC: Failed to request %s: %d\n", w->name, ret); return NULL; @@ -3435,6 +3458,16 @@ int snd_soc_dapm_new_controls(struct snd_soc_dapm_context *dapm, mutex_lock_nested(&dapm->card->dapm_mutex, SND_SOC_DAPM_CLASS_INIT); for (i = 0; i < num; i++) { w = snd_soc_dapm_new_control_unlocked(dapm, widget); + if (IS_ERR(w)) { + ret = PTR_ERR(w); + /* Do not nag about probe deferrals */ + if (ret == -EPROBE_DEFER) + break; + dev_err(dapm->dev, + "ASoC: Failed to create DAPM control %s (%d)\n", + widget->name, ret); + break; + } if (!w) { dev_err(dapm->dev, "ASoC: Failed to create DAPM control %s\n", @@ -3701,6 +3734,15 @@ int snd_soc_dapm_new_pcm(struct snd_soc_card *card, dev_dbg(card->dev, "ASoC: adding %s widget\n", link_name); w = snd_soc_dapm_new_control_unlocked(&card->dapm, &template); + if (IS_ERR(w)) { + ret = PTR_ERR(w); + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(card->dev, + "ASoC: Failed to create %s widget (%d)\n", + link_name, ret); + goto outfree_kcontrol_news; + } if (!w) { dev_err(card->dev, "ASoC: Failed to create %s widget\n", link_name); diff --git a/sound/soc/soc-topology.c b/sound/soc/soc-topology.c index 70396d3f6472..e3f34a86413c 100644 --- a/sound/soc/soc-topology.c +++ b/sound/soc/soc-topology.c @@ -1481,6 +1481,15 @@ static int soc_tplg_dapm_widget_create(struct soc_tplg *tplg, widget = snd_soc_dapm_new_control(dapm, &template); else widget = snd_soc_dapm_new_control_unlocked(dapm, &template); + if (IS_ERR(widget)) { + ret = PTR_ERR(widget); + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(tplg->dev, + "ASoC: failed to create widget %s controls (%d)\n", + w->name, ret); + goto hdr_err; + } if (widget == NULL) { dev_err(tplg->dev, "ASoC: failed to create widget %s controls\n", w->name); From 093fe104c5bbd99b4934bd80216b6e6d7371c4fc Mon Sep 17 00:00:00 2001 From: Richard Guy Briggs Date: Tue, 17 Jan 2017 11:07:15 -0500 Subject: [PATCH 318/546] audit: log 32-bit socketcalls [ Upstream commit 62bc306e2083436675e33b5bdeb6a77907d35971 ] 32-bit socketcalls were not being logged by audit on x86_64 systems. Log them. This is basically a duplicate of the call from net/socket.c:sys_socketcall(), but it addresses the impedance mismatch between 32-bit userspace process and 64-bit kernel audit. See: https://github.com/linux-audit/audit-kernel/issues/14 Signed-off-by: Richard Guy Briggs Acked-by: David S. Miller Signed-off-by: Paul Moore Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- include/linux/audit.h | 20 ++++++++++++++++++++ net/compat.c | 17 ++++++++++++++--- 2 files changed, 34 insertions(+), 3 deletions(-) diff --git a/include/linux/audit.h b/include/linux/audit.h index 20eba1eb0a3c..faac391badac 100644 --- a/include/linux/audit.h +++ b/include/linux/audit.h @@ -281,6 +281,20 @@ static inline int audit_socketcall(int nargs, unsigned long *args) return __audit_socketcall(nargs, args); return 0; } + +static inline int audit_socketcall_compat(int nargs, u32 *args) +{ + unsigned long a[AUDITSC_ARGS]; + int i; + + if (audit_dummy_context()) + return 0; + + for (i = 0; i < nargs; i++) + a[i] = (unsigned long)args[i]; + return __audit_socketcall(nargs, a); +} + static inline int audit_sockaddr(int len, void *addr) { if (unlikely(!audit_dummy_context())) @@ -407,6 +421,12 @@ static inline int audit_socketcall(int nargs, unsigned long *args) { return 0; } + +static inline int audit_socketcall_compat(int nargs, u32 *args) +{ + return 0; +} + static inline void audit_fd_pair(int fd1, int fd2) { } static inline int audit_sockaddr(int len, void *addr) diff --git a/net/compat.c b/net/compat.c index 5cfd26a0006f..0ccf3ecf6bbb 100644 --- a/net/compat.c +++ b/net/compat.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include @@ -767,14 +768,24 @@ COMPAT_SYSCALL_DEFINE5(recvmmsg, int, fd, struct compat_mmsghdr __user *, mmsg, COMPAT_SYSCALL_DEFINE2(socketcall, int, call, u32 __user *, args) { - int ret; - u32 a[6]; + u32 a[AUDITSC_ARGS]; + unsigned int len; u32 a0, a1; + int ret; if (call < SYS_SOCKET || call > SYS_SENDMMSG) return -EINVAL; - if (copy_from_user(a, args, nas[call])) + len = nas[call]; + if (len > sizeof(a)) + return -EINVAL; + + if (copy_from_user(a, args, len)) return -EFAULT; + + ret = audit_socketcall_compat(len / sizeof(a[0]), a); + if (ret) + return ret; + a0 = a[0]; a1 = a[1]; From 7b8c9e6e0fca80c0da2ab94b5b1f96841f0c9be4 Mon Sep 17 00:00:00 2001 From: Peter Chen Date: Wed, 19 Oct 2016 15:32:58 +0800 Subject: [PATCH 319/546] usb: chipidea: vbus event may exist before starting gadget [ Upstream commit c3b674a04b8ab62a1d35e86714d466af0a0ecc18 ] At some situations, the vbus may already be there before starting gadget. So we need to check vbus event after switching to gadget in order to handle missing vbus event. The typical use cases are plugging vbus cable before driver load or the vbus has already been there after stopping host but before starting gadget. Signed-off-by: Peter Chen Tested-by: Stephen Boyd Reported-by: Stephen Boyd Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/usb/chipidea/otg.c | 17 ++++++++++++----- 1 file changed, 12 insertions(+), 5 deletions(-) diff --git a/drivers/usb/chipidea/otg.c b/drivers/usb/chipidea/otg.c index 0cf149edddd8..f36a1ac3bfbd 100644 --- a/drivers/usb/chipidea/otg.c +++ b/drivers/usb/chipidea/otg.c @@ -134,9 +134,9 @@ void ci_handle_vbus_change(struct ci_hdrc *ci) if (!ci->is_otg) return; - if (hw_read_otgsc(ci, OTGSC_BSV)) + if (hw_read_otgsc(ci, OTGSC_BSV) && !ci->vbus_active) usb_gadget_vbus_connect(&ci->gadget); - else + else if (!hw_read_otgsc(ci, OTGSC_BSV) && ci->vbus_active) usb_gadget_vbus_disconnect(&ci->gadget); } @@ -175,14 +175,21 @@ static void ci_handle_id_switch(struct ci_hdrc *ci) ci_role_stop(ci); - if (role == CI_ROLE_GADGET) + if (role == CI_ROLE_GADGET && + IS_ERR(ci->platdata->vbus_extcon.edev)) /* - * wait vbus lower than OTGSC_BSV before connecting - * to host + * Wait vbus lower than OTGSC_BSV before connecting + * to host. If connecting status is from an external + * connector instead of register, we don't need to + * care vbus on the board, since it will not affect + * external connector status. */ hw_wait_vbus_lower_bsv(ci); ci_role_start(ci, role); + /* vbus change may have already occurred */ + if (role == CI_ROLE_GADGET) + ci_handle_vbus_change(ci); } } /** From c5710390cc7605f66cef6ce34e0635fc464bfd9b Mon Sep 17 00:00:00 2001 From: Linus Walleij Date: Fri, 20 Jan 2017 14:07:52 +0100 Subject: [PATCH 320/546] ASoC: dapm: fix some pointer error handling [ Upstream commit 639467c8f26d834c934215e8b59129ce442475fe ] commit 66feeec9322132689d42723df2537d60f96f8e44 "RFC: ASoC: dapm: handle probe deferrals" forgot a to update some two sites where the call was used. The static codechecks quickly found them. Reported-by: Dan Carpenter Fixes: 66feeec93221 ("RFC: ASoC: dapm: handle probe deferrals") Signed-off-by: Linus Walleij Signed-off-by: Mark Brown Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- sound/soc/soc-dapm.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c index df036afb2197..6a438a361592 100644 --- a/sound/soc/soc-dapm.c +++ b/sound/soc/soc-dapm.c @@ -3794,6 +3794,16 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, template.name); w = snd_soc_dapm_new_control_unlocked(dapm, &template); + if (IS_ERR(w)) { + int ret = PTR_ERR(w); + + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(dapm->dev, + "ASoC: Failed to create %s widget (%d)\n", + dai->driver->playback.stream_name, ret); + return ret; + } if (!w) { dev_err(dapm->dev, "ASoC: Failed to create %s widget\n", dai->driver->playback.stream_name); @@ -3813,6 +3823,16 @@ int snd_soc_dapm_new_dai_widgets(struct snd_soc_dapm_context *dapm, template.name); w = snd_soc_dapm_new_control_unlocked(dapm, &template); + if (IS_ERR(w)) { + int ret = PTR_ERR(w); + + /* Do not nag about probe deferrals */ + if (ret != -EPROBE_DEFER) + dev_err(dapm->dev, + "ASoC: Failed to create %s widget (%d)\n", + dai->driver->playback.stream_name, ret); + return ret; + } if (!w) { dev_err(dapm->dev, "ASoC: Failed to create %s widget\n", dai->driver->capture.stream_name); From 4e6cdc0a7decd1cc5396b7d28c3ff8d47e3ae78b Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 17 Jan 2017 16:18:40 +0100 Subject: [PATCH 321/546] MIPS: Lantiq: Fix another request_mem_region() return code check [ Upstream commit 98ea51cb0c8ce009d9da1fd7b48f0ff1d7a9bbb0 ] Hauke already fixed a couple of them, but one instance remains that checks for a negative integer when it should check for a NULL pointer: arch/mips/lantiq/xway/sysctrl.c: In function 'ltq_soc_init': arch/mips/lantiq/xway/sysctrl.c:473:19: error: ordered comparison of pointer with integer zero [-Werror=extra] Fixes: 6e807852676a ("MIPS: Lantiq: Fix check for return value of request_mem_region()") Signed-off-by: Arnd Bergmann Cc: John Crispin Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/15043/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/lantiq/xway/sysctrl.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/mips/lantiq/xway/sysctrl.c b/arch/mips/lantiq/xway/sysctrl.c index daf580ce5ca2..2528181232fd 100644 --- a/arch/mips/lantiq/xway/sysctrl.c +++ b/arch/mips/lantiq/xway/sysctrl.c @@ -469,8 +469,8 @@ void __init ltq_soc_init(void) panic("Failed to load xbar nodes from devicetree"); if (of_address_to_resource(np_xbar, 0, &res_xbar)) panic("Failed to get xbar resources"); - if (request_mem_region(res_xbar.start, resource_size(&res_xbar), - res_xbar.name) < 0) + if (!request_mem_region(res_xbar.start, resource_size(&res_xbar), + res_xbar.name)) panic("Failed to get xbar resources"); ltq_xbar_membase = ioremap_nocache(res_xbar.start, From b9ff317b5cd4a8f7ca13934af679e129c3f1d2ce Mon Sep 17 00:00:00 2001 From: Myungho Jung Date: Tue, 25 Apr 2017 11:58:15 -0700 Subject: [PATCH 322/546] net: core: Prevent from dereferencing null pointer when releasing SKB [ Upstream commit 9899886d5e8ec5b343b1efe44f185a0e68dc6454 ] Added NULL check to make __dev_kfree_skb_irq consistent with kfree family of functions. Link: https://bugzilla.kernel.org/show_bug.cgi?id=195289 Signed-off-by: Myungho Jung Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/core/dev.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/net/core/dev.c b/net/core/dev.c index 24d243084aab..dac52fa60f25 100644 --- a/net/core/dev.c +++ b/net/core/dev.c @@ -2338,6 +2338,9 @@ void __dev_kfree_skb_irq(struct sk_buff *skb, enum skb_free_reason reason) { unsigned long flags; + if (unlikely(!skb)) + return; + if (likely(atomic_read(&skb->users) == 1)) { smp_rmb(); atomic_set(&skb->users, 0); From fa63895f47c9253a0305a5d0862e98ab6f11e718 Mon Sep 17 00:00:00 2001 From: Alexander Potapenko Date: Tue, 25 Apr 2017 18:51:46 +0200 Subject: [PATCH 323/546] net/packet: check length in getsockopt() called with PACKET_HDRLEN [ Upstream commit fd2c83b35752f0a8236b976978ad4658df14a59f ] In the case getsockopt() is called with PACKET_HDRLEN and optlen < 4 |val| remains uninitialized and the syscall may behave differently depending on its value, and even copy garbage to userspace on certain architectures. To fix this we now return -EINVAL if optlen is too small. This bug has been detected with KMSAN. Signed-off-by: Alexander Potapenko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/packet/af_packet.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index 148ec130d99d..b70055fc30cb 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -3802,6 +3802,8 @@ static int packet_getsockopt(struct socket *sock, int level, int optname, case PACKET_HDRLEN: if (len > sizeof(int)) len = sizeof(int); + if (len < sizeof(int)) + return -EINVAL; if (copy_from_user(&val, optval, len)) return -EFAULT; switch (val) { From 4212115da67bcaacb134def45c25f21666bbace9 Mon Sep 17 00:00:00 2001 From: Pan Bian Date: Mon, 24 Apr 2017 18:29:16 +0800 Subject: [PATCH 324/546] team: fix memory leaks [ Upstream commit 72ec0bc64b9a5d8e0efcb717abfc757746b101b7 ] In functions team_nl_send_port_list_get() and team_nl_send_options_get(), pointer skb keeps the return value of nlmsg_new(). When the call to genlmsg_put() fails, the memory is not freed(). This will result in memory leak bugs. Fixes: 9b00cf2d1024 ("team: implement multipart netlink messages for options transfers") Signed-off-by: Pan Bian Acked-by: Jiri Pirko Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/team/team.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/net/team/team.c b/drivers/net/team/team.c index a5f392ae30d5..61cd53838360 100644 --- a/drivers/net/team/team.c +++ b/drivers/net/team/team.c @@ -2343,8 +2343,10 @@ static int team_nl_send_options_get(struct team *team, u32 portid, u32 seq, hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_OPTIONS_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(skb); return -EMSGSIZE; + } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; @@ -2611,8 +2613,10 @@ static int team_nl_send_port_list_get(struct team *team, u32 portid, u32 seq, hdr = genlmsg_put(skb, portid, seq, &team_nl_family, flags | NLM_F_MULTI, TEAM_CMD_PORT_LIST_GET); - if (!hdr) + if (!hdr) { + nlmsg_free(skb); return -EMSGSIZE; + } if (nla_put_u32(skb, TEAM_ATTR_TEAM_IFINDEX, team->dev->ifindex)) goto nla_put_failure; From e1e99dc319ccb5403f1264fe411fa3b637c6d783 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Roman=20Spycha=C5=82a?= Date: Thu, 20 Apr 2017 12:04:10 +0200 Subject: [PATCH 325/546] usb: plusb: Add support for PL-27A1 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit [ Upstream commit 6f2aee0c0de65013333bbc26fe50c9c7b09a37f7 ] This patch adds support for the PL-27A1 by adding the appropriate USB ID's. This chip is used in the goobay Active USB 3.0 Data Link and Unitek Y-3501 cables. Signed-off-by: Roman Spychała Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/Kconfig | 2 +- drivers/net/usb/plusb.c | 15 +++++++++++++-- 2 files changed, 14 insertions(+), 3 deletions(-) diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 7f83504dfa69..1f6893ebce16 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -364,7 +364,7 @@ config USB_NET_NET1080 optionally with LEDs that indicate traffic config USB_NET_PLUSB - tristate "Prolific PL-2301/2302/25A1 based cables" + tristate "Prolific PL-2301/2302/25A1/27A1 based cables" # if the handshake/init/reset problems, from original 'plusb', # are ever resolved ... then remove "experimental" depends on USB_USBNET diff --git a/drivers/net/usb/plusb.c b/drivers/net/usb/plusb.c index 1bfe0fcaccf5..7c02231c1a1b 100644 --- a/drivers/net/usb/plusb.c +++ b/drivers/net/usb/plusb.c @@ -102,7 +102,7 @@ static int pl_reset(struct usbnet *dev) } static const struct driver_info prolific_info = { - .description = "Prolific PL-2301/PL-2302/PL-25A1", + .description = "Prolific PL-2301/PL-2302/PL-25A1/PL-27A1", .flags = FLAG_POINTTOPOINT | FLAG_NO_SETINT, /* some PL-2302 versions seem to fail usb_set_interface() */ .reset = pl_reset, @@ -139,6 +139,17 @@ static const struct usb_device_id products [] = { * Host-to-Host Cable */ .driver_info = (unsigned long) &prolific_info, + +}, + +/* super speed cables */ +{ + USB_DEVICE(0x067b, 0x27a1), /* PL-27A1, no eeprom + * also: goobay Active USB 3.0 + * Data Link, + * Unitek Y-3501 + */ + .driver_info = (unsigned long) &prolific_info, }, { }, // END @@ -158,5 +169,5 @@ static struct usb_driver plusb_driver = { module_usb_driver(plusb_driver); MODULE_AUTHOR("David Brownell"); -MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1 USB Host to Host Link Driver"); +MODULE_DESCRIPTION("Prolific PL-2301/2302/25A1/27A1 USB Host to Host Link Driver"); MODULE_LICENSE("GPL"); From 1b760fdad9f0626e7be2c1082c94dca21f72470b Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Wed, 29 Mar 2017 20:54:37 +0200 Subject: [PATCH 326/546] mmc: sdio: fix alignment issue in struct sdio_func [ Upstream commit 5ef1ecf060f28ecef313b5723f1fd39bf5a35f56 ] Certain 64-bit systems (e.g. Amlogic Meson GX) require buffers to be used for DMA to be 8-byte-aligned. struct sdio_func has an embedded small DMA buffer not meeting this requirement. When testing switching to descriptor chain mode in meson-gx driver SDIO is broken therefore. Fix this by allocating the small DMA buffer separately as kmalloc ensures that the returned memory area is properly aligned for every basic data type. Signed-off-by: Heiner Kallweit Tested-by: Helmut Klein Signed-off-by: Ulf Hansson Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/mmc/core/sdio_bus.c | 12 +++++++++++- include/linux/mmc/sdio_func.h | 2 +- 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 7e327a6dd53d..c23bc4f331bd 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -266,7 +266,7 @@ static void sdio_release_func(struct device *dev) sdio_free_func_cis(func); kfree(func->info); - + kfree(func->tmpbuf); kfree(func); } @@ -281,6 +281,16 @@ struct sdio_func *sdio_alloc_func(struct mmc_card *card) if (!func) return ERR_PTR(-ENOMEM); + /* + * allocate buffer separately to make sure it's properly aligned for + * DMA usage (incl. 64 bit DMA) + */ + func->tmpbuf = kmalloc(4, GFP_KERNEL); + if (!func->tmpbuf) { + kfree(func); + return ERR_PTR(-ENOMEM); + } + func->card = card; device_initialize(&func->dev); diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index aab032a6ae61..97ca105347a6 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -53,7 +53,7 @@ struct sdio_func { unsigned int state; /* function state */ #define SDIO_STATE_PRESENT (1<<0) /* present in sysfs */ - u8 tmpbuf[4]; /* DMA:able scratch buffer */ + u8 *tmpbuf; /* DMA:able scratch buffer */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ From e29066778bc28eff5f63616800c6b60f12c87267 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Mon, 10 Apr 2017 14:59:28 +0300 Subject: [PATCH 327/546] bridge: netlink: register netdevice before executing changelink [ Upstream commit 5b8d5429daa05bebef6ffd3297df3b502cc6f184 ] Peter reported a kernel oops when executing the following command: $ ip link add name test type bridge vlan_default_pvid 1 [13634.939408] BUG: unable to handle kernel NULL pointer dereference at 0000000000000190 [13634.939436] IP: __vlan_add+0x73/0x5f0 [...] [13634.939783] Call Trace: [13634.939791] ? pcpu_next_unpop+0x3b/0x50 [13634.939801] ? pcpu_alloc+0x3d2/0x680 [13634.939810] ? br_vlan_add+0x135/0x1b0 [13634.939820] ? __br_vlan_set_default_pvid.part.28+0x204/0x2b0 [13634.939834] ? br_changelink+0x120/0x4e0 [13634.939844] ? br_dev_newlink+0x50/0x70 [13634.939854] ? rtnl_newlink+0x5f5/0x8a0 [13634.939864] ? rtnl_newlink+0x176/0x8a0 [13634.939874] ? mem_cgroup_commit_charge+0x7c/0x4e0 [13634.939886] ? rtnetlink_rcv_msg+0xe1/0x220 [13634.939896] ? lookup_fast+0x52/0x370 [13634.939905] ? rtnl_newlink+0x8a0/0x8a0 [13634.939915] ? netlink_rcv_skb+0xa1/0xc0 [13634.939925] ? rtnetlink_rcv+0x24/0x30 [13634.939934] ? netlink_unicast+0x177/0x220 [13634.939944] ? netlink_sendmsg+0x2fe/0x3b0 [13634.939954] ? _copy_from_user+0x39/0x40 [13634.939964] ? sock_sendmsg+0x30/0x40 [13634.940159] ? ___sys_sendmsg+0x29d/0x2b0 [13634.940326] ? __alloc_pages_nodemask+0xdf/0x230 [13634.940478] ? mem_cgroup_commit_charge+0x7c/0x4e0 [13634.940592] ? mem_cgroup_try_charge+0x76/0x1a0 [13634.940701] ? __handle_mm_fault+0xdb9/0x10b0 [13634.940809] ? __sys_sendmsg+0x51/0x90 [13634.940917] ? entry_SYSCALL_64_fastpath+0x1e/0xad The problem is that the bridge's VLAN group is created after setting the default PVID, when registering the netdevice and executing its ndo_init(). Fix this by changing the order of both operations, so that br_changelink() is only processed after the netdevice is registered, when the VLAN group is already initialized. Fixes: b6677449dff6 ("bridge: netlink: call br_changelink() during br_dev_newlink()") Signed-off-by: Nikolay Aleksandrov Signed-off-by: Ido Schimmel Reported-by: Peter V. Saveliev Tested-by: Peter V. Saveliev Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/bridge/br_netlink.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c index ff8bb41d713f..a1f697ec4fc2 100644 --- a/net/bridge/br_netlink.c +++ b/net/bridge/br_netlink.c @@ -1073,11 +1073,14 @@ static int br_dev_newlink(struct net *src_net, struct net_device *dev, spin_unlock_bh(&br->lock); } - err = br_changelink(dev, tb, data); + err = register_netdevice(dev); if (err) return err; - return register_netdevice(dev); + err = br_changelink(dev, tb, data); + if (err) + unregister_netdevice(dev); + return err; } static size_t br_get_size(const struct net_device *brdev) From f7f46b3ba20def4321bfbe3a88cd1ae23eb40b07 Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sat, 25 Mar 2017 08:53:12 +0800 Subject: [PATCH 328/546] netfilter: invoke synchronize_rcu after set the _hook_ to NULL [ Upstream commit 3b7dabf029478bb80507a6c4500ca94132a2bc0b ] Otherwise, another CPU may access the invalid pointer. For example: CPU0 CPU1 - rcu_read_lock(); - pfunc = _hook_; _hook_ = NULL; - mod unload - - pfunc(); // invalid, panic - rcu_read_unlock(); So we must call synchronize_rcu() to wait the rcu reader to finish. Also note, in nf_nat_snmp_basic_fini, synchronize_rcu() will be invoked by later nf_conntrack_helper_unregister, but I'm inclined to add a explicit synchronize_rcu after set the nf_nat_snmp_hook to NULL. Depend on such obscure assumptions is not a good idea. Last, in nfnetlink_cttimeout, we use kfree_rcu to free the time object, so in cttimeout_exit, invoking rcu_barrier() is not necessary at all, remove it too. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/ipv4/netfilter/nf_nat_snmp_basic.c | 1 + net/netfilter/nf_conntrack_ecache.c | 2 ++ net/netfilter/nf_conntrack_netlink.c | 1 + net/netfilter/nf_nat_core.c | 2 ++ net/netfilter/nfnetlink_cttimeout.c | 2 +- 5 files changed, 7 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c index ddb894ac1458..2689c9c4f1a0 100644 --- a/net/ipv4/netfilter/nf_nat_snmp_basic.c +++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c @@ -1304,6 +1304,7 @@ static int __init nf_nat_snmp_basic_init(void) static void __exit nf_nat_snmp_basic_fini(void) { RCU_INIT_POINTER(nf_nat_snmp_hook, NULL); + synchronize_rcu(); nf_conntrack_helper_unregister(&snmp_trap_helper); } diff --git a/net/netfilter/nf_conntrack_ecache.c b/net/netfilter/nf_conntrack_ecache.c index 4e78c57b818f..f3b92ce463b0 100644 --- a/net/netfilter/nf_conntrack_ecache.c +++ b/net/netfilter/nf_conntrack_ecache.c @@ -200,6 +200,7 @@ void nf_conntrack_unregister_notifier(struct net *net, BUG_ON(notify != new); RCU_INIT_POINTER(net->ct.nf_conntrack_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); + /* synchronize_rcu() is called from ctnetlink_exit. */ } EXPORT_SYMBOL_GPL(nf_conntrack_unregister_notifier); @@ -236,6 +237,7 @@ void nf_ct_expect_unregister_notifier(struct net *net, BUG_ON(notify != new); RCU_INIT_POINTER(net->ct.nf_expect_event_cb, NULL); mutex_unlock(&nf_ct_ecache_mutex); + /* synchronize_rcu() is called from ctnetlink_exit. */ } EXPORT_SYMBOL_GPL(nf_ct_expect_unregister_notifier); diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c index e565b2becb14..660939df7c94 100644 --- a/net/netfilter/nf_conntrack_netlink.c +++ b/net/netfilter/nf_conntrack_netlink.c @@ -3415,6 +3415,7 @@ static void __exit ctnetlink_exit(void) #ifdef CONFIG_NETFILTER_NETLINK_GLUE_CT RCU_INIT_POINTER(nfnl_ct_hook, NULL); #endif + synchronize_rcu(); } module_init(ctnetlink_init); diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c index 06a9f45771ab..44516c90118a 100644 --- a/net/netfilter/nf_nat_core.c +++ b/net/netfilter/nf_nat_core.c @@ -892,6 +892,8 @@ static void __exit nf_nat_cleanup(void) #ifdef CONFIG_XFRM RCU_INIT_POINTER(nf_nat_decode_session_hook, NULL); #endif + synchronize_rcu(); + for (i = 0; i < NFPROTO_NUMPROTO; i++) kfree(nf_nat_l4protos[i]); synchronize_net(); diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c index c7a2d0e1c462..ed9153bd7e73 100644 --- a/net/netfilter/nfnetlink_cttimeout.c +++ b/net/netfilter/nfnetlink_cttimeout.c @@ -611,8 +611,8 @@ static void __exit cttimeout_exit(void) #ifdef CONFIG_NF_CONNTRACK_TIMEOUT RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL); RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL); + synchronize_rcu(); #endif /* CONFIG_NF_CONNTRACK_TIMEOUT */ - rcu_barrier(); } module_init(cttimeout_init); From 8bd7216d338694126aec35d936b02b300289b6ae Mon Sep 17 00:00:00 2001 From: Matt Redfearn Date: Tue, 21 Mar 2017 14:52:25 +0000 Subject: [PATCH 329/546] MIPS: IRQ Stack: Unwind IRQ stack onto task stack [ Upstream commit db8466c581cca1a08b505f1319c3ecd246f16fa8 ] When the separate IRQ stack was introduced, stack unwinding only proceeded as far as the top of the IRQ stack, leading to kernel backtraces being less useful, lacking the trace of what was interrupted. Fix this by providing a means for the kernel to unwind the IRQ stack onto the interrupted task stack. The processor state is saved to the kernel task stack on interrupt. The IRQ_STACK_START macro reserves an unsigned long at the top of the IRQ stack where the interrupted task stack pointer can be saved. After the active stack is switched to the IRQ stack, save the interrupted tasks stack pointer to the reserved location. Fix the stack unwinding code to look for the frame being the top of the IRQ stack and if so get the next frame from the saved location. The existing test does not work with the separate stack since the ra is no longer pointed at ret_from_{irq,exception}. The test to stop unwinding the stack 32 bytes from the top of a stack must be modified to allow unwinding to continue up to the location of the saved task stack pointer when on the IRQ stack. The low / high marks of the stack are set depending on whether the sp is on an irq stack or not. Signed-off-by: Matt Redfearn Cc: Paolo Bonzini Cc: Marcin Nowakowski Cc: Masanari Iida Cc: Chris Metcalf Cc: James Hogan Cc: Paul Burton Cc: Ingo Molnar Cc: Jason A. Donenfeld Cc: Andrew Morton Cc: linux-mips@linux-mips.org Cc: linux-kernel@vger.kernel.org Patchwork: https://patchwork.linux-mips.org/patch/15788/ Signed-off-by: Ralf Baechle Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/mips/include/asm/irq.h | 15 +++++++++ arch/mips/kernel/asm-offsets.c | 1 + arch/mips/kernel/genex.S | 8 +++-- arch/mips/kernel/process.c | 56 +++++++++++++++++++++++----------- 4 files changed, 60 insertions(+), 20 deletions(-) diff --git a/arch/mips/include/asm/irq.h b/arch/mips/include/asm/irq.h index ebb9efb02502..77edb22f855d 100644 --- a/arch/mips/include/asm/irq.h +++ b/arch/mips/include/asm/irq.h @@ -18,9 +18,24 @@ #include #define IRQ_STACK_SIZE THREAD_SIZE +#define IRQ_STACK_START (IRQ_STACK_SIZE - sizeof(unsigned long)) extern void *irq_stack[NR_CPUS]; +/* + * The highest address on the IRQ stack contains a dummy frame put down in + * genex.S (handle_int & except_vec_vi_handler) which is structured as follows: + * + * top ------------ + * | task sp | <- irq_stack[cpu] + IRQ_STACK_START + * ------------ + * | | <- First frame of IRQ context + * ------------ + * + * task sp holds a copy of the task stack pointer where the struct pt_regs + * from exception entry can be found. + */ + static inline bool on_irq_stack(int cpu, unsigned long sp) { unsigned long low = (unsigned long)irq_stack[cpu]; diff --git a/arch/mips/kernel/asm-offsets.c b/arch/mips/kernel/asm-offsets.c index ec053ce7bb38..7ab8004c1659 100644 --- a/arch/mips/kernel/asm-offsets.c +++ b/arch/mips/kernel/asm-offsets.c @@ -102,6 +102,7 @@ void output_thread_info_defines(void) DEFINE(_THREAD_SIZE, THREAD_SIZE); DEFINE(_THREAD_MASK, THREAD_MASK); DEFINE(_IRQ_STACK_SIZE, IRQ_STACK_SIZE); + DEFINE(_IRQ_STACK_START, IRQ_STACK_START); BLANK(); } diff --git a/arch/mips/kernel/genex.S b/arch/mips/kernel/genex.S index 619e30e2c4f0..bb72f3ce7e29 100644 --- a/arch/mips/kernel/genex.S +++ b/arch/mips/kernel/genex.S @@ -216,9 +216,11 @@ NESTED(handle_int, PT_SIZE, sp) beq t0, t1, 2f /* Switch to IRQ stack */ - li t1, _IRQ_STACK_SIZE + li t1, _IRQ_STACK_START PTR_ADD sp, t0, t1 + /* Save task's sp on IRQ stack so that unwinding can follow it */ + LONG_S s1, 0(sp) 2: jal plat_irq_dispatch @@ -326,9 +328,11 @@ NESTED(except_vec_vi_handler, 0, sp) beq t0, t1, 2f /* Switch to IRQ stack */ - li t1, _IRQ_STACK_SIZE + li t1, _IRQ_STACK_START PTR_ADD sp, t0, t1 + /* Save task's sp on IRQ stack so that unwinding can follow it */ + LONG_S s1, 0(sp) 2: jalr v0 diff --git a/arch/mips/kernel/process.c b/arch/mips/kernel/process.c index 8c26ecac930d..477ba026c3e5 100644 --- a/arch/mips/kernel/process.c +++ b/arch/mips/kernel/process.c @@ -483,31 +483,52 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page, unsigned long pc, unsigned long *ra) { + unsigned long low, high, irq_stack_high; struct mips_frame_info info; unsigned long size, ofs; + struct pt_regs *regs; int leaf; - extern void ret_from_irq(void); - extern void ret_from_exception(void); if (!stack_page) return 0; /* - * If we reached the bottom of interrupt context, - * return saved pc in pt_regs. + * IRQ stacks start at IRQ_STACK_START + * task stacks at THREAD_SIZE - 32 */ - if (pc == (unsigned long)ret_from_irq || - pc == (unsigned long)ret_from_exception) { - struct pt_regs *regs; - if (*sp >= stack_page && - *sp + sizeof(*regs) <= stack_page + THREAD_SIZE - 32) { - regs = (struct pt_regs *)*sp; - pc = regs->cp0_epc; - if (!user_mode(regs) && __kernel_text_address(pc)) { - *sp = regs->regs[29]; - *ra = regs->regs[31]; - return pc; - } + low = stack_page; + if (!preemptible() && on_irq_stack(raw_smp_processor_id(), *sp)) { + high = stack_page + IRQ_STACK_START; + irq_stack_high = high; + } else { + high = stack_page + THREAD_SIZE - 32; + irq_stack_high = 0; + } + + /* + * If we reached the top of the interrupt stack, start unwinding + * the interrupted task stack. + */ + if (unlikely(*sp == irq_stack_high)) { + unsigned long task_sp = *(unsigned long *)*sp; + + /* + * Check that the pointer saved in the IRQ stack head points to + * something within the stack of the current task + */ + if (!object_is_on_stack((void *)task_sp)) + return 0; + + /* + * Follow pointer to tasks kernel stack frame where interrupted + * state was saved. + */ + regs = (struct pt_regs *)task_sp; + pc = regs->cp0_epc; + if (!user_mode(regs) && __kernel_text_address(pc)) { + *sp = regs->regs[29]; + *ra = regs->regs[31]; + return pc; } return 0; } @@ -528,8 +549,7 @@ unsigned long notrace unwind_stack_by_address(unsigned long stack_page, if (leaf < 0) return 0; - if (*sp < stack_page || - *sp + info.frame_size > stack_page + THREAD_SIZE - 32) + if (*sp < low || *sp + info.frame_size > high) return 0; if (leaf) From fa029020bddd92f62b3acb4ec39b8bbc66746950 Mon Sep 17 00:00:00 2001 From: Thibault Saunier Date: Wed, 1 Feb 2017 18:05:21 -0200 Subject: [PATCH 330/546] exynos-gsc: Do not swap cb/cr for semi planar formats [ Upstream commit d7f3e33df4fbdc9855fb151f4a328ec46447e3ba ] In the case of semi planar formats cb and cr are in the same plane in memory, meaning that will be set to 'cb' whatever the format is, and whatever the (packed) order of those components are. Suggested-by: Nicolas Dufresne Signed-off-by: Thibault Saunier Signed-off-by: Javier Martinez Canillas Acked-by: Sylwester Nawrocki Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/media/platform/exynos-gsc/gsc-core.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/drivers/media/platform/exynos-gsc/gsc-core.c b/drivers/media/platform/exynos-gsc/gsc-core.c index 9b9e423e4fc4..15c543d4b366 100644 --- a/drivers/media/platform/exynos-gsc/gsc-core.c +++ b/drivers/media/platform/exynos-gsc/gsc-core.c @@ -849,9 +849,7 @@ int gsc_prepare_addr(struct gsc_ctx *ctx, struct vb2_buffer *vb, if ((frame->fmt->pixelformat == V4L2_PIX_FMT_VYUY) || (frame->fmt->pixelformat == V4L2_PIX_FMT_YVYU) || - (frame->fmt->pixelformat == V4L2_PIX_FMT_NV61) || (frame->fmt->pixelformat == V4L2_PIX_FMT_YVU420) || - (frame->fmt->pixelformat == V4L2_PIX_FMT_NV21) || (frame->fmt->pixelformat == V4L2_PIX_FMT_YVU420M)) swap(addr->cb, addr->cr); From 4203f2a73882455c0e0c01d7a58a6e24a8cd43ac Mon Sep 17 00:00:00 2001 From: Liping Zhang Date: Sun, 19 Mar 2017 22:35:59 +0800 Subject: [PATCH 331/546] netfilter: nfnl_cthelper: fix incorrect helper->expect_class_max [ Upstream commit ae5c682113f9f94cc5e76f92cf041ee624c173ee ] The helper->expect_class_max must be set to the total number of expect_policy minus 1, since we will use the statement "if (class > helper->expect_class_max)" to validate the CTA_EXPECT_CLASS attr in ctnetlink_alloc_expect. So for compatibility, set the helper->expect_class_max to the NFCTH_POLICY_SET_NUM attr's value minus 1. Also: it's invalid when the NFCTH_POLICY_SET_NUM attr's value is zero. 1. this will result "expect_policy = kzalloc(0, GFP_KERNEL);"; 2. we cannot set the helper->expect_class_max to a proper value. So if nla_get_be32(tb[NFCTH_POLICY_SET_NUM]) is zero, report -EINVAL to the userspace. Signed-off-by: Liping Zhang Signed-off-by: Pablo Neira Ayuso Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/netfilter/nfnetlink_cthelper.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c index 54330fb5efaf..6d10002d23f8 100644 --- a/net/netfilter/nfnetlink_cthelper.c +++ b/net/netfilter/nfnetlink_cthelper.c @@ -161,6 +161,7 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, int i, ret; struct nf_conntrack_expect_policy *expect_policy; struct nlattr *tb[NFCTH_POLICY_SET_MAX+1]; + unsigned int class_max; ret = nla_parse_nested(tb, NFCTH_POLICY_SET_MAX, attr, nfnl_cthelper_expect_policy_set); @@ -170,19 +171,18 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, if (!tb[NFCTH_POLICY_SET_NUM]) return -EINVAL; - helper->expect_class_max = - ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); - - if (helper->expect_class_max != 0 && - helper->expect_class_max > NF_CT_MAX_EXPECT_CLASSES) + class_max = ntohl(nla_get_be32(tb[NFCTH_POLICY_SET_NUM])); + if (class_max == 0) + return -EINVAL; + if (class_max > NF_CT_MAX_EXPECT_CLASSES) return -EOVERFLOW; expect_policy = kzalloc(sizeof(struct nf_conntrack_expect_policy) * - helper->expect_class_max, GFP_KERNEL); + class_max, GFP_KERNEL); if (expect_policy == NULL) return -ENOMEM; - for (i=0; iexpect_class_max; i++) { + for (i = 0; i < class_max; i++) { if (!tb[NFCTH_POLICY_SET+i]) goto err; @@ -191,6 +191,8 @@ nfnl_cthelper_parse_expect_policy(struct nf_conntrack_helper *helper, if (ret < 0) goto err; } + + helper->expect_class_max = class_max - 1; helper->expect_policy = expect_policy; return 0; err: @@ -377,10 +379,10 @@ nfnl_cthelper_dump_policy(struct sk_buff *skb, goto nla_put_failure; if (nla_put_be32(skb, NFCTH_POLICY_SET_NUM, - htonl(helper->expect_class_max))) + htonl(helper->expect_class_max + 1))) goto nla_put_failure; - for (i=0; iexpect_class_max; i++) { + for (i = 0; i < helper->expect_class_max + 1; i++) { nest_parms2 = nla_nest_start(skb, (NFCTH_POLICY_SET+i) | NLA_F_NESTED); if (nest_parms2 == NULL) From cadfa3a688d2f1f618677ddc66cb4f5cdbae6a81 Mon Sep 17 00:00:00 2001 From: Arvind Yadav Date: Tue, 14 Mar 2017 15:24:51 +0530 Subject: [PATCH 332/546] parisc: perf: Fix potential NULL pointer dereference [ Upstream commit 74e3f6e63da6c8e8246fba1689e040bc926b4a1a ] Fix potential NULL pointer dereference and clean up coding style errors (code indent, trailing whitespaces). Signed-off-by: Arvind Yadav Signed-off-by: Helge Deller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- arch/parisc/kernel/perf.c | 94 ++++++++++++++++++++------------------- 1 file changed, 49 insertions(+), 45 deletions(-) diff --git a/arch/parisc/kernel/perf.c b/arch/parisc/kernel/perf.c index 518f4f5f1f43..d63d42533133 100644 --- a/arch/parisc/kernel/perf.c +++ b/arch/parisc/kernel/perf.c @@ -39,7 +39,7 @@ * the PDC INTRIGUE calls. This is done to eliminate bugs introduced * in various PDC revisions. The code is much more maintainable * and reliable this way vs having to debug on every version of PDC - * on every box. + * on every box. */ #include @@ -195,8 +195,8 @@ static int perf_config(uint32_t *image_ptr); static int perf_release(struct inode *inode, struct file *file); static int perf_open(struct inode *inode, struct file *file); static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t *ppos); -static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, - loff_t *ppos); +static ssize_t perf_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos); static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg); static void perf_start_counters(void); static int perf_stop_counters(uint32_t *raddr); @@ -222,7 +222,7 @@ extern void perf_intrigue_disable_perf_counters (void); /* * configure: * - * Configure the cpu with a given data image. First turn off the counters, + * Configure the cpu with a given data image. First turn off the counters, * then download the image, then turn the counters back on. */ static int perf_config(uint32_t *image_ptr) @@ -234,7 +234,7 @@ static int perf_config(uint32_t *image_ptr) error = perf_stop_counters(raddr); if (error != 0) { printk("perf_config: perf_stop_counters = %ld\n", error); - return -EINVAL; + return -EINVAL; } printk("Preparing to write image\n"); @@ -242,7 +242,7 @@ printk("Preparing to write image\n"); error = perf_write_image((uint64_t *)image_ptr); if (error != 0) { printk("perf_config: DOWNLOAD = %ld\n", error); - return -EINVAL; + return -EINVAL; } printk("Preparing to start counters\n"); @@ -254,7 +254,7 @@ printk("Preparing to start counters\n"); } /* - * Open the device and initialize all of its memory. The device is only + * Open the device and initialize all of its memory. The device is only * opened once, but can be "queried" by multiple processes that know its * file descriptor. */ @@ -298,8 +298,8 @@ static ssize_t perf_read(struct file *file, char __user *buf, size_t cnt, loff_t * called on the processor that the download should happen * on. */ -static ssize_t perf_write(struct file *file, const char __user *buf, size_t count, - loff_t *ppos) +static ssize_t perf_write(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) { int err; size_t image_size; @@ -307,11 +307,11 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun uint32_t interface_type; uint32_t test; - if (perf_processor_interface == ONYX_INTF) + if (perf_processor_interface == ONYX_INTF) image_size = PCXU_IMAGE_SIZE; - else if (perf_processor_interface == CUDA_INTF) + else if (perf_processor_interface == CUDA_INTF) image_size = PCXW_IMAGE_SIZE; - else + else return -EFAULT; if (!capable(CAP_SYS_ADMIN)) @@ -331,22 +331,22 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun /* First check the machine type is correct for the requested image */ - if (((perf_processor_interface == CUDA_INTF) && - (interface_type != CUDA_INTF)) || - ((perf_processor_interface == ONYX_INTF) && - (interface_type != ONYX_INTF))) + if (((perf_processor_interface == CUDA_INTF) && + (interface_type != CUDA_INTF)) || + ((perf_processor_interface == ONYX_INTF) && + (interface_type != ONYX_INTF))) return -EINVAL; /* Next check to make sure the requested image is valid */ - if (((interface_type == CUDA_INTF) && + if (((interface_type == CUDA_INTF) && (test >= MAX_CUDA_IMAGES)) || - ((interface_type == ONYX_INTF) && - (test >= MAX_ONYX_IMAGES))) + ((interface_type == ONYX_INTF) && + (test >= MAX_ONYX_IMAGES))) return -EINVAL; /* Copy the image into the processor */ - if (interface_type == CUDA_INTF) + if (interface_type == CUDA_INTF) return perf_config(cuda_images[test]); else return perf_config(onyx_images[test]); @@ -360,7 +360,7 @@ static ssize_t perf_write(struct file *file, const char __user *buf, size_t coun static void perf_patch_images(void) { #if 0 /* FIXME!! */ -/* +/* * NOTE: this routine is VERY specific to the current TLB image. * If the image is changed, this routine might also need to be changed. */ @@ -368,9 +368,9 @@ static void perf_patch_images(void) extern void $i_dtlb_miss_2_0(); extern void PA2_0_iva(); - /* + /* * We can only use the lower 32-bits, the upper 32-bits should be 0 - * anyway given this is in the kernel + * anyway given this is in the kernel */ uint32_t itlb_addr = (uint32_t)&($i_itlb_miss_2_0); uint32_t dtlb_addr = (uint32_t)&($i_dtlb_miss_2_0); @@ -378,21 +378,21 @@ static void perf_patch_images(void) if (perf_processor_interface == ONYX_INTF) { /* clear last 2 bytes */ - onyx_images[TLBMISS][15] &= 0xffffff00; + onyx_images[TLBMISS][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[TLBMISS][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[TLBMISS][16] = (dtlb_addr << 8)&0xffffff00; onyx_images[TLBMISS][17] = itlb_addr; /* clear last 2 bytes */ - onyx_images[TLBHANDMISS][15] &= 0xffffff00; + onyx_images[TLBHANDMISS][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[TLBHANDMISS][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[TLBHANDMISS][16] = (dtlb_addr << 8)&0xffffff00; onyx_images[TLBHANDMISS][17] = itlb_addr; /* clear last 2 bytes */ - onyx_images[BIG_CPI][15] &= 0xffffff00; + onyx_images[BIG_CPI][15] &= 0xffffff00; /* set 2 bytes */ onyx_images[BIG_CPI][15] |= (0x000000ff&((dtlb_addr) >> 24)); onyx_images[BIG_CPI][16] = (dtlb_addr << 8)&0xffffff00; @@ -405,24 +405,24 @@ static void perf_patch_images(void) } else if (perf_processor_interface == CUDA_INTF) { /* Cuda interface */ - cuda_images[TLBMISS][16] = + cuda_images[TLBMISS][16] = (cuda_images[TLBMISS][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[TLBMISS][17] = + cuda_images[TLBMISS][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[TLBMISS][18] = (itlb_addr << 16)&0xffff0000; - cuda_images[TLBHANDMISS][16] = + cuda_images[TLBHANDMISS][16] = (cuda_images[TLBHANDMISS][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[TLBHANDMISS][17] = + cuda_images[TLBHANDMISS][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[TLBHANDMISS][18] = (itlb_addr << 16)&0xffff0000; - cuda_images[BIG_CPI][16] = + cuda_images[BIG_CPI][16] = (cuda_images[BIG_CPI][16]&0xffff0000) | ((dtlb_addr >> 8)&0x0000ffff); - cuda_images[BIG_CPI][17] = + cuda_images[BIG_CPI][17] = ((dtlb_addr << 24)&0xff000000) | ((itlb_addr >> 16)&0x000000ff); cuda_images[BIG_CPI][18] = (itlb_addr << 16)&0xffff0000; } else { @@ -434,7 +434,7 @@ static void perf_patch_images(void) /* * ioctl routine - * All routines effect the processor that they are executed on. Thus you + * All routines effect the processor that they are executed on. Thus you * must be running on the processor that you wish to change. */ @@ -460,7 +460,7 @@ static long perf_ioctl(struct file *file, unsigned int cmd, unsigned long arg) } /* copy out the Counters */ - if (copy_to_user((void __user *)arg, raddr, + if (copy_to_user((void __user *)arg, raddr, sizeof (raddr)) != 0) { error = -EFAULT; break; @@ -488,7 +488,7 @@ static const struct file_operations perf_fops = { .open = perf_open, .release = perf_release }; - + static struct miscdevice perf_dev = { MISC_DYNAMIC_MINOR, PA_PERF_DEV, @@ -596,7 +596,7 @@ static int perf_stop_counters(uint32_t *raddr) /* OR sticky2 (bit 1496) to counter2 bit 32 */ tmp64 |= (userbuf[23] >> 8) & 0x0000000080000000; raddr[2] = (uint32_t)tmp64; - + /* Counter3 is bits 1497 to 1528 */ tmp64 = (userbuf[23] >> 7) & 0x00000000ffffffff; /* OR sticky3 (bit 1529) to counter3 bit 32 */ @@ -618,7 +618,7 @@ static int perf_stop_counters(uint32_t *raddr) userbuf[22] = 0; userbuf[23] = 0; - /* + /* * Write back the zeroed bytes + the image given * the read was destructive. */ @@ -626,13 +626,13 @@ static int perf_stop_counters(uint32_t *raddr) } else { /* - * Read RDR-15 which contains the counters and sticky bits + * Read RDR-15 which contains the counters and sticky bits */ if (!perf_rdr_read_ubuf(15, userbuf)) { return -13; } - /* + /* * Clear out the counters */ perf_rdr_clear(15); @@ -645,7 +645,7 @@ static int perf_stop_counters(uint32_t *raddr) raddr[2] = (uint32_t)((userbuf[1] >> 32) & 0x00000000ffffffffUL); raddr[3] = (uint32_t)(userbuf[1] & 0x00000000ffffffffUL); } - + return 0; } @@ -683,7 +683,7 @@ static int perf_rdr_read_ubuf(uint32_t rdr_num, uint64_t *buffer) i = tentry->num_words; while (i--) { buffer[i] = 0; - } + } /* Check for bits an even number of 64 */ if ((xbits = width & 0x03f) != 0) { @@ -809,18 +809,22 @@ static int perf_write_image(uint64_t *memaddr) } runway = ioremap_nocache(cpu_device->hpa.start, 4096); + if (!runway) { + pr_err("perf_write_image: ioremap failed!\n"); + return -ENOMEM; + } /* Merge intrigue bits into Runway STATUS 0 */ tmp64 = __raw_readq(runway + RUNWAY_STATUS) & 0xffecfffffffffffful; - __raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul), + __raw_writeq(tmp64 | (*memaddr++ & 0x0013000000000000ul), runway + RUNWAY_STATUS); - + /* Write RUNWAY DEBUG registers */ for (i = 0; i < 8; i++) { __raw_writeq(*memaddr++, runway + RUNWAY_DEBUG); } - return 0; + return 0; } /* @@ -844,7 +848,7 @@ printk("perf_rdr_write\n"); perf_rdr_shift_out_U(rdr_num, buffer[i]); } else { perf_rdr_shift_out_W(rdr_num, buffer[i]); - } + } } printk("perf_rdr_write done\n"); } From 9bcd5ceef96e71d57795c593e4dacf767f935c79 Mon Sep 17 00:00:00 2001 From: Oleksandr Tyshchenko Date: Mon, 27 Feb 2017 14:30:25 +0200 Subject: [PATCH 333/546] iommu/io-pgtable-arm: Check for leaf entry before dereferencing it [ Upstream commit ed46e66cc1b3d684042f92dfa2ab15ee917b4cac ] Do a check for already installed leaf entry at the current level before dereferencing it in order to avoid walking the page table down with wrong pointer to the next level. Signed-off-by: Oleksandr Tyshchenko CC: Will Deacon CC: Robin Murphy Signed-off-by: Will Deacon Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/io-pgtable-arm.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/iommu/io-pgtable-arm.c b/drivers/iommu/io-pgtable-arm.c index dad768caa9c5..18751b1dfd3d 100644 --- a/drivers/iommu/io-pgtable-arm.c +++ b/drivers/iommu/io-pgtable-arm.c @@ -335,8 +335,12 @@ static int __arm_lpae_map(struct arm_lpae_io_pgtable *data, unsigned long iova, if (cfg->quirks & IO_PGTABLE_QUIRK_ARM_NS) pte |= ARM_LPAE_PTE_NSTABLE; __arm_lpae_set_pte(ptep, pte, cfg); - } else { + } else if (!iopte_leaf(pte, lvl)) { cptep = iopte_deref(pte, data); + } else { + /* We require an unmap first */ + WARN_ON(!selftest_running); + return -EEXIST; } /* Rinse, repeat */ From 13099ee9c7d54b0a25f6c8397675aed99e9cfa45 Mon Sep 17 00:00:00 2001 From: Zhu Yanjun Date: Tue, 7 Mar 2017 02:48:36 -0500 Subject: [PATCH 334/546] rds: ib: add error handle [ Upstream commit 3b12f73a5c2977153f28a224392fd4729b50d1dc ] In the function rds_ib_setup_qp, the error handle is missing. When some error occurs, it is possible that memory leak occurs. As such, error handle is added. Cc: Joe Jin Reviewed-by: Junxiao Bi Reviewed-by: Guanglei Li Signed-off-by: Zhu Yanjun Acked-by: Santosh Shilimkar Signed-off-by: David S. Miller Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- net/rds/ib_cm.c | 47 ++++++++++++++++++++++++++++++++++++----------- 1 file changed, 36 insertions(+), 11 deletions(-) diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index da5a7fb98c77..a6f5b3d21571 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -381,7 +381,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ret = PTR_ERR(ic->i_send_cq); ic->i_send_cq = NULL; rdsdebug("ib_create_cq send failed: %d\n", ret); - goto out; + goto rds_ibdev_out; } cq_attr.cqe = ic->i_recv_ring.w_nr; @@ -392,19 +392,19 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ret = PTR_ERR(ic->i_recv_cq); ic->i_recv_cq = NULL; rdsdebug("ib_create_cq recv failed: %d\n", ret); - goto out; + goto send_cq_out; } ret = ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP); if (ret) { rdsdebug("ib_req_notify_cq send failed: %d\n", ret); - goto out; + goto recv_cq_out; } ret = ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED); if (ret) { rdsdebug("ib_req_notify_cq recv failed: %d\n", ret); - goto out; + goto recv_cq_out; } /* XXX negotiate max send/recv with remote? */ @@ -428,7 +428,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr); if (ret) { rdsdebug("rdma_create_qp failed: %d\n", ret); - goto out; + goto recv_cq_out; } ic->i_send_hdrs = ib_dma_alloc_coherent(dev, @@ -438,7 +438,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_send_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent send failed\n"); - goto out; + goto qp_out; } ic->i_recv_hdrs = ib_dma_alloc_coherent(dev, @@ -448,7 +448,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_recv_hdrs) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent recv failed\n"); - goto out; + goto send_hdrs_dma_out; } ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header), @@ -456,7 +456,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_ack) { ret = -ENOMEM; rdsdebug("ib_dma_alloc_coherent ack failed\n"); - goto out; + goto recv_hdrs_dma_out; } ic->i_sends = vzalloc_node(ic->i_send_ring.w_nr * sizeof(struct rds_ib_send_work), @@ -464,7 +464,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_sends) { ret = -ENOMEM; rdsdebug("send allocation failed\n"); - goto out; + goto ack_dma_out; } ic->i_recvs = vzalloc_node(ic->i_recv_ring.w_nr * sizeof(struct rds_ib_recv_work), @@ -472,7 +472,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn) if (!ic->i_recvs) { ret = -ENOMEM; rdsdebug("recv allocation failed\n"); - goto out; + goto sends_out; } rds_ib_recv_init_ack(ic); @@ -480,8 +480,33 @@ static int rds_ib_setup_qp(struct rds_connection *conn) rdsdebug("conn %p pd %p cq %p %p\n", conn, ic->i_pd, ic->i_send_cq, ic->i_recv_cq); -out: + return ret; + +sends_out: + vfree(ic->i_sends); +ack_dma_out: + ib_dma_free_coherent(dev, sizeof(struct rds_header), + ic->i_ack, ic->i_ack_dma); +recv_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_recv_ring.w_nr * + sizeof(struct rds_header), + ic->i_recv_hdrs, ic->i_recv_hdrs_dma); +send_hdrs_dma_out: + ib_dma_free_coherent(dev, ic->i_send_ring.w_nr * + sizeof(struct rds_header), + ic->i_send_hdrs, ic->i_send_hdrs_dma); +qp_out: + rdma_destroy_qp(ic->i_cm_id); +recv_cq_out: + if (!ib_destroy_cq(ic->i_recv_cq)) + ic->i_recv_cq = NULL; +send_cq_out: + if (!ib_destroy_cq(ic->i_send_cq)) + ic->i_send_cq = NULL; +rds_ibdev_out: + rds_ib_remove_conn(rds_ibdev, conn); rds_ib_dev_put(rds_ibdev); + return ret; } From cb07496eab4335c4fd0d90c1cb78f1e85e937ebb Mon Sep 17 00:00:00 2001 From: Shaohua Li Date: Thu, 23 Feb 2017 12:26:41 -0800 Subject: [PATCH 335/546] md/raid10: submit bio directly to replacement disk [ Upstream commit 6d399783e9d4e9bd44931501948059d24ad96ff8 ] Commit 57c67df(md/raid10: submit IO from originating thread instead of md thread) submits bio directly for normal disks but not for replacement disks. There is no point we shouldn't do this for replacement disks. Cc: NeilBrown Signed-off-by: Shaohua Li Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/md/raid10.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c index e5ee4e9e0ea5..a8a86d450d76 100644 --- a/drivers/md/raid10.c +++ b/drivers/md/raid10.c @@ -1414,11 +1414,24 @@ static void __make_request(struct mddev *mddev, struct bio *bio) mbio->bi_private = r10_bio; atomic_inc(&r10_bio->remaining); + + cb = blk_check_plugged(raid10_unplug, mddev, + sizeof(*plug)); + if (cb) + plug = container_of(cb, struct raid10_plug_cb, + cb); + else + plug = NULL; spin_lock_irqsave(&conf->device_lock, flags); - bio_list_add(&conf->pending_bio_list, mbio); - conf->pending_count++; + if (plug) { + bio_list_add(&plug->pending, mbio); + plug->pending_cnt++; + } else { + bio_list_add(&conf->pending_bio_list, mbio); + conf->pending_count++; + } spin_unlock_irqrestore(&conf->device_lock, flags); - if (!mddev_check_plugged(mddev)) + if (!plug) md_wakeup_thread(mddev->thread); } } From 088b9a41b605079f253b99f4bba868eda89bc9fa Mon Sep 17 00:00:00 2001 From: Heiner Kallweit Date: Tue, 7 Mar 2017 21:06:38 +0100 Subject: [PATCH 336/546] i2c: meson: fix wrong variable usage in meson_i2c_put_data [ Upstream commit 3b0277f198ac928f323c42e180680d2f79aa980d ] Most likely a copy & paste error. Signed-off-by: Heiner Kallweit Acked-by: Jerome Brunet Signed-off-by: Wolfram Sang Fixes: 30021e3707a7 ("i2c: add support for Amlogic Meson I2C controller") Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/i2c/busses/i2c-meson.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/i2c/busses/i2c-meson.c b/drivers/i2c/busses/i2c-meson.c index 71d3929adf54..8d65f33af5da 100644 --- a/drivers/i2c/busses/i2c-meson.c +++ b/drivers/i2c/busses/i2c-meson.c @@ -175,7 +175,7 @@ static void meson_i2c_put_data(struct meson_i2c *i2c, char *buf, int len) wdata1 |= *buf++ << ((i - 4) * 8); writel(wdata0, i2c->regs + REG_TOK_WDATA0); - writel(wdata0, i2c->regs + REG_TOK_WDATA1); + writel(wdata1, i2c->regs + REG_TOK_WDATA1); dev_dbg(i2c->dev, "%s: data %08x %08x len %d\n", __func__, wdata0, wdata1, len); From 0185496a115dbdee8b2428541949217039222d84 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 6 Mar 2017 11:58:20 -0800 Subject: [PATCH 337/546] xfs: remove kmem_zalloc_greedy [ Upstream commit 08b005f1333154ae5b404ca28766e0ffb9f1c150 ] The sole remaining caller of kmem_zalloc_greedy is bulkstat, which uses it to grab 1-4 pages for staging of inobt records. The infinite loop in the greedy allocation function is causing hangs[1] in generic/269, so just get rid of the greedy allocator in favor of kmem_zalloc_large. This makes bulkstat somewhat more likely to ENOMEM if there's really no pages to spare, but eliminates a source of hangs. [1] http://lkml.kernel.org/r/20170301044634.rgidgdqqiiwsmfpj%40XZHOUW.usersys.redhat.com Signed-off-by: Darrick J. Wong Reviewed-by: Christoph Hellwig Signed-off-by: Greg Kroah-Hartman --- fs/xfs/kmem.c | 18 ------------------ fs/xfs/kmem.h | 2 -- fs/xfs/xfs_itable.c | 6 ++---- 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/fs/xfs/kmem.c b/fs/xfs/kmem.c index 686ba6fb20dd..8067364c602f 100644 --- a/fs/xfs/kmem.c +++ b/fs/xfs/kmem.c @@ -24,24 +24,6 @@ #include "kmem.h" #include "xfs_message.h" -/* - * Greedy allocation. May fail and may return vmalloced memory. - */ -void * -kmem_zalloc_greedy(size_t *size, size_t minsize, size_t maxsize) -{ - void *ptr; - size_t kmsize = maxsize; - - while (!(ptr = vzalloc(kmsize))) { - if ((kmsize >>= 1) <= minsize) - kmsize = minsize; - } - if (ptr) - *size = kmsize; - return ptr; -} - void * kmem_alloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/kmem.h b/fs/xfs/kmem.h index cc6b768fc068..ae45f77ce33b 100644 --- a/fs/xfs/kmem.h +++ b/fs/xfs/kmem.h @@ -69,8 +69,6 @@ static inline void kmem_free(const void *ptr) } -extern void *kmem_zalloc_greedy(size_t *, size_t, size_t); - static inline void * kmem_zalloc(size_t size, xfs_km_flags_t flags) { diff --git a/fs/xfs/xfs_itable.c b/fs/xfs/xfs_itable.c index 930ebd86beba..99a4891c00ab 100644 --- a/fs/xfs/xfs_itable.c +++ b/fs/xfs/xfs_itable.c @@ -351,7 +351,6 @@ xfs_bulkstat( xfs_agino_t agino; /* inode # in allocation group */ xfs_agnumber_t agno; /* allocation group number */ xfs_btree_cur_t *cur; /* btree cursor for ialloc btree */ - size_t irbsize; /* size of irec buffer in bytes */ xfs_inobt_rec_incore_t *irbuf; /* start of irec buffer */ int nirbuf; /* size of irbuf */ int ubcount; /* size of user's buffer */ @@ -378,11 +377,10 @@ xfs_bulkstat( *ubcountp = 0; *done = 0; - irbuf = kmem_zalloc_greedy(&irbsize, PAGE_SIZE, PAGE_SIZE * 4); + irbuf = kmem_zalloc_large(PAGE_SIZE * 4, KM_SLEEP); if (!irbuf) return -ENOMEM; - - nirbuf = irbsize / sizeof(*irbuf); + nirbuf = (PAGE_SIZE * 4) / sizeof(*irbuf); /* * Loop over the allocation groups, starting from the last From f6c8c71cc9011061b6223b2ae43d87fa5a34be52 Mon Sep 17 00:00:00 2001 From: Gwendal Grignou Date: Fri, 3 Mar 2017 09:00:09 -0800 Subject: [PATCH 338/546] libata: transport: Remove circular dependency at free time [ Upstream commit d85fc67dd11e9a32966140677d4d6429ca540b25 ] Without this patch, failed probe would not free resources like irq. ata port tdev object currently hold a reference to the ata port object. Therefore the ata port object release function will not get called until the ata_tport_release is called. But that would never happen, releasing the last reference of ata port dev is done by scsi_host_release, which is called by ata_host_release when the ata port object is released. The ata device objects actually do not need to explicitly hold a reference to their real counterpart, given the transport objects are the children of these objects and device_add() is call for each child. We know the parent will not be deleted until we call the child's device_del(). Reported-by: Matthew Whitehead Tested-by: Matthew Whitehead Suggested-by: Tejun Heo Signed-off-by: Gwendal Grignou Signed-off-by: Tejun Heo Signed-off-by: Sasha Levin Signed-off-by: Greg Kroah-Hartman --- drivers/ata/libata-transport.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/ata/libata-transport.c b/drivers/ata/libata-transport.c index e2d94972962d..7aa10c200ecb 100644 --- a/drivers/ata/libata-transport.c +++ b/drivers/ata/libata-transport.c @@ -224,7 +224,6 @@ static DECLARE_TRANSPORT_CLASS(ata_port_class, static void ata_tport_release(struct device *dev) { - put_device(dev->parent); } /** @@ -284,7 +283,7 @@ int ata_tport_add(struct device *parent, device_initialize(dev); dev->type = &ata_port_type; - dev->parent = get_device(parent); + dev->parent = parent; dev->release = ata_tport_release; dev_set_name(dev, "ata%d", ap->print_id); transport_setup_device(dev); @@ -348,7 +347,6 @@ static DECLARE_TRANSPORT_CLASS(ata_link_class, static void ata_tlink_release(struct device *dev) { - put_device(dev->parent); } /** @@ -410,7 +408,7 @@ int ata_tlink_add(struct ata_link *link) int error; device_initialize(dev); - dev->parent = get_device(&ap->tdev); + dev->parent = &ap->tdev; dev->release = ata_tlink_release; if (ata_is_host_link(link)) dev_set_name(dev, "link%d", ap->print_id); @@ -588,7 +586,6 @@ static DECLARE_TRANSPORT_CLASS(ata_dev_class, static void ata_tdev_release(struct device *dev) { - put_device(dev->parent); } /** @@ -661,7 +658,7 @@ static int ata_tdev_add(struct ata_device *ata_dev) int error; device_initialize(dev); - dev->parent = get_device(&link->tdev); + dev->parent = &link->tdev; dev->release = ata_tdev_release; if (ata_is_host_link(link)) dev_set_name(dev, "dev%d.%d", ap->print_id,ata_dev->devno); From 13af23e0181223aa674567fd01102e1cc9a3a56c Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Wed, 20 Apr 2016 11:20:27 +0100 Subject: [PATCH 339/546] drivers: firmware: psci: drop duplicate const from psci_of_match commit 1d2d8de44a6c20af262b4c3d3b93ef7ec3c5488e upstream. This is to fix below sparse warning: drivers/firmware/psci.c:mmm:nn: warning: duplicate const Signed-off-by: Jisheng Zhang Signed-off-by: Lorenzo Pieralisi Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- drivers/firmware/psci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/firmware/psci.c b/drivers/firmware/psci.c index d24f35d74b27..ae70d2485ca1 100644 --- a/drivers/firmware/psci.c +++ b/drivers/firmware/psci.c @@ -424,7 +424,7 @@ static int __init psci_0_1_init(struct device_node *np) return err; } -static const struct of_device_id const psci_of_match[] __initconst = { +static const struct of_device_id psci_of_match[] __initconst = { { .compatible = "arm,psci", .data = psci_0_1_init}, { .compatible = "arm,psci-0.2", .data = psci_0_2_init}, { .compatible = "arm,psci-1.0", .data = psci_0_2_init}, From 7cad91f22d5edbf92489d3ae4f022e6a20c1ec86 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 14 Mar 2017 13:18:45 +0100 Subject: [PATCH 340/546] IB/qib: fix false-postive maybe-uninitialized warning commit f6aafac184a3e46e919769dd4faa8bf0dc436534 upstream. aarch64-linux-gcc-7 complains about code it doesn't fully understand: drivers/infiniband/hw/qib/qib_iba7322.c: In function 'qib_7322_txchk_change': include/asm-generic/bitops/non-atomic.h:105:35: error: 'shadow' may be used uninitialized in this function [-Werror=maybe-uninitialized] The code is right, and despite trying hard, I could not come up with a version that I liked better than just adding a fake initialization here to shut up the warning. Fixes: f931551bafe1 ("IB/qib: Add new qib driver for QLogic PCIe InfiniBand adapters") Signed-off-by: Arnd Bergmann Acked-by: Ira Weiny Signed-off-by: Doug Ledford Signed-off-by: Greg Kroah-Hartman --- drivers/infiniband/hw/qib/qib_iba7322.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/infiniband/hw/qib/qib_iba7322.c b/drivers/infiniband/hw/qib/qib_iba7322.c index 6c8ff10101c0..77cc77ba998f 100644 --- a/drivers/infiniband/hw/qib/qib_iba7322.c +++ b/drivers/infiniband/hw/qib/qib_iba7322.c @@ -7097,7 +7097,7 @@ static void qib_7322_txchk_change(struct qib_devdata *dd, u32 start, unsigned long flags; while (wait) { - unsigned long shadow; + unsigned long shadow = 0; int cstart, previ = -1; /* From d32ee7026081bc43313dc8f7aac8bbf93206e6a5 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 11 May 2017 13:50:16 +0200 Subject: [PATCH 341/546] ARM: remove duplicate 'const' annotations' MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 0527873b29b077fc8e656acd63e1866b429fef55 upstream. gcc-7 warns about some declarations that are more 'const' than necessary: arch/arm/mach-at91/pm.c:338:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const ramc_ids[] __initconst = { arch/arm/mach-bcm/bcm_kona_smc.c:36:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const bcm_kona_smc_ids[] __initconst = { arch/arm/mach-spear/time.c:207:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const timer_of_match[] __initconst = { arch/arm/mach-omap2/prm_common.c:714:34: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct of_device_id const omap_prcm_dt_match_table[] __initconst = { arch/arm/mach-omap2/vc.c:562:35: error: duplicate 'const' declaration specifier [-Werror=duplicate-decl-specifier] static const struct i2c_init_data const omap4_i2c_timing_data[] __initconst = { The ones in arch/arm were apparently all introduced accidentally by one commit that correctly marked a lot of variables as __initconst. Fixes: 19c233b79d1a ("ARM: appropriate __init annotation for const data") Acked-by: Alexandre Belloni Acked-by: Tony Lindgren Acked-by: Nicolas Pitre Acked-by: Florian Fainelli Acked-by: Viresh Kumar Acked-by: Krzysztof Hałasa Signed-off-by: Arnd Bergmann Signed-off-by: Greg Kroah-Hartman --- arch/arm/mach-at91/pm.c | 2 +- arch/arm/mach-bcm/bcm_kona_smc.c | 2 +- arch/arm/mach-cns3xxx/core.c | 2 +- arch/arm/mach-omap2/prm_common.c | 2 +- arch/arm/mach-omap2/vc.c | 2 +- arch/arm/mach-spear/time.c | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/arch/arm/mach-at91/pm.c b/arch/arm/mach-at91/pm.c index d687f860a2da..84eefbc2b4f9 100644 --- a/arch/arm/mach-at91/pm.c +++ b/arch/arm/mach-at91/pm.c @@ -332,7 +332,7 @@ static void at91sam9_sdram_standby(void) at91_ramc_write(1, AT91_SDRAMC_LPR, saved_lpr1); } -static const struct of_device_id const ramc_ids[] __initconst = { +static const struct of_device_id ramc_ids[] __initconst = { { .compatible = "atmel,at91rm9200-sdramc", .data = at91rm9200_standby }, { .compatible = "atmel,at91sam9260-sdramc", .data = at91sam9_sdram_standby }, { .compatible = "atmel,at91sam9g45-ddramc", .data = at91_ddr_standby }, diff --git a/arch/arm/mach-bcm/bcm_kona_smc.c b/arch/arm/mach-bcm/bcm_kona_smc.c index cf3f8658f0e5..a55a7ecf146a 100644 --- a/arch/arm/mach-bcm/bcm_kona_smc.c +++ b/arch/arm/mach-bcm/bcm_kona_smc.c @@ -33,7 +33,7 @@ struct bcm_kona_smc_data { unsigned result; }; -static const struct of_device_id const bcm_kona_smc_ids[] __initconst = { +static const struct of_device_id bcm_kona_smc_ids[] __initconst = { {.compatible = "brcm,kona-smc"}, {.compatible = "bcm,kona-smc"}, /* deprecated name */ {}, diff --git a/arch/arm/mach-cns3xxx/core.c b/arch/arm/mach-cns3xxx/core.c index 9b1dc223d8d3..e17a0e025f62 100644 --- a/arch/arm/mach-cns3xxx/core.c +++ b/arch/arm/mach-cns3xxx/core.c @@ -346,7 +346,7 @@ static struct usb_ohci_pdata cns3xxx_usb_ohci_pdata = { .power_off = csn3xxx_usb_power_off, }; -static const struct of_dev_auxdata const cns3xxx_auxdata[] __initconst = { +static const struct of_dev_auxdata cns3xxx_auxdata[] __initconst = { { "intel,usb-ehci", CNS3XXX_USB_BASE, "ehci-platform", &cns3xxx_usb_ehci_pdata }, { "intel,usb-ohci", CNS3XXX_USB_OHCI_BASE, "ohci-platform", &cns3xxx_usb_ohci_pdata }, { "cavium,cns3420-ahci", CNS3XXX_SATA2_BASE, "ahci", NULL }, diff --git a/arch/arm/mach-omap2/prm_common.c b/arch/arm/mach-omap2/prm_common.c index 3fc2cbe52113..0ce4548ef7f0 100644 --- a/arch/arm/mach-omap2/prm_common.c +++ b/arch/arm/mach-omap2/prm_common.c @@ -706,7 +706,7 @@ static struct omap_prcm_init_data scrm_data __initdata = { }; #endif -static const struct of_device_id const omap_prcm_dt_match_table[] __initconst = { +static const struct of_device_id omap_prcm_dt_match_table[] __initconst = { #ifdef CONFIG_SOC_AM33XX { .compatible = "ti,am3-prcm", .data = &am3_prm_data }, #endif diff --git a/arch/arm/mach-omap2/vc.c b/arch/arm/mach-omap2/vc.c index 2028167fff31..d76b1e5eb8ba 100644 --- a/arch/arm/mach-omap2/vc.c +++ b/arch/arm/mach-omap2/vc.c @@ -559,7 +559,7 @@ struct i2c_init_data { u8 hsscll_12; }; -static const struct i2c_init_data const omap4_i2c_timing_data[] __initconst = { +static const struct i2c_init_data omap4_i2c_timing_data[] __initconst = { { .load = 50, .loadbits = 0x3, diff --git a/arch/arm/mach-spear/time.c b/arch/arm/mach-spear/time.c index 9ccffc1d0f28..aaaa6781b9fe 100644 --- a/arch/arm/mach-spear/time.c +++ b/arch/arm/mach-spear/time.c @@ -204,7 +204,7 @@ static void __init spear_clockevent_init(int irq) setup_irq(irq, &spear_timer_irq); } -static const struct of_device_id const timer_of_match[] __initconst = { +static const struct of_device_id timer_of_match[] __initconst = { { .compatible = "st,spear-timer", }, { }, }; From 2b2bfb537be44253eb9d2dc9a0f959fdc3102427 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 23 Mar 2017 16:15:55 +0100 Subject: [PATCH 342/546] ALSA: au88x0: avoid theoretical uninitialized access commit 13f99ebdd602ebdafb909e15ec6ffb1e34690167 upstream. The latest gcc-7.0.1 snapshot points out that we if nr_ch is zero, we never initialize some variables: sound/pci/au88x0/au88x0_core.c: In function 'vortex_adb_allocroute': sound/pci/au88x0/au88x0_core.c:2304:68: error: 'mix[0]' may be used uninitialized in this function [-Werror=maybe-uninitialized] sound/pci/au88x0/au88x0_core.c:2305:58: error: 'src[0]' may be used uninitialized in this function [-Werror=maybe-uninitialized] I assume this can never happen in practice, but adding a check here doesn't hurt either and avoids the warning. The code has been unchanged since the start of git history. Signed-off-by: Arnd Bergmann Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/pci/au88x0/au88x0_core.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/sound/pci/au88x0/au88x0_core.c b/sound/pci/au88x0/au88x0_core.c index d3125c169684..065a69cf6118 100644 --- a/sound/pci/au88x0/au88x0_core.c +++ b/sound/pci/au88x0/au88x0_core.c @@ -2279,6 +2279,9 @@ vortex_adb_allocroute(vortex_t *vortex, int dma, int nr_ch, int dir, } else { int src[2], mix[2]; + if (nr_ch < 1) + return -EINVAL; + /* Get SRC and MIXER hardware resources. */ for (i = 0; i < nr_ch; i++) { if ((mix[i] = From 2536c20e82852dc0eb0eb5d4f09593de72445be3 Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Thu, 2 Feb 2017 12:51:28 -0200 Subject: [PATCH 343/546] ttpci: address stringop overflow warning commit 69d3973af1acd4c0989ec8218c05f12d303cd7cf upstream. gcc-7.0.1 warns about old code in ttpci: In file included from drivers/media/pci/ttpci/av7110.c:63:0: In function 'irdebi.isra.2', inlined from 'start_debi_dma' at drivers/media/pci/ttpci/av7110.c:376:3, inlined from 'gpioirq' at drivers/media/pci/ttpci/av7110.c:659:3: drivers/media/pci/ttpci/av7110_hw.h:406:3: warning: 'memcpy': specified size between 18446744071562067968 and 18446744073709551615 exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=] memcpy(av7110->debi_virt, (char *) &res, count); In function 'irdebi.isra.2', inlined from 'start_debi_dma' at drivers/media/pci/ttpci/av7110.c:376:3, inlined from 'gpioirq' at drivers/media/pci/ttpci/av7110.c:668:3: drivers/media/pci/ttpci/av7110_hw.h:406:3: warning: 'memcpy': specified size between 18446744071562067968 and 18446744073709551615 exceeds maximum object size 9223372036854775807 [-Wstringop-overflow=] memcpy(av7110->debi_virt, (char *) &res, count); Apparently, 'count' can be negative here, which will then get turned into a giant size argument for memcpy. Changing the sizes to 'unsigned int' instead seems safe as we already check for maximum sizes, and it also simplifies the code a bit. Signed-off-by: Arnd Bergmann Signed-off-by: Mauro Carvalho Chehab Signed-off-by: Greg Kroah-Hartman --- drivers/media/pci/ttpci/av7110_hw.c | 8 ++++---- drivers/media/pci/ttpci/av7110_hw.h | 12 ++++++------ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/drivers/media/pci/ttpci/av7110_hw.c b/drivers/media/pci/ttpci/av7110_hw.c index 300bd3c94738..0992bb0e207e 100644 --- a/drivers/media/pci/ttpci/av7110_hw.c +++ b/drivers/media/pci/ttpci/av7110_hw.c @@ -56,11 +56,11 @@ by Nathan Laredo */ int av7110_debiwrite(struct av7110 *av7110, u32 config, - int addr, u32 val, int count) + int addr, u32 val, unsigned int count) { struct saa7146_dev *dev = av7110->dev; - if (count <= 0 || count > 32764) { + if (count > 32764) { printk("%s: invalid count %d\n", __func__, count); return -1; } @@ -78,12 +78,12 @@ int av7110_debiwrite(struct av7110 *av7110, u32 config, return 0; } -u32 av7110_debiread(struct av7110 *av7110, u32 config, int addr, int count) +u32 av7110_debiread(struct av7110 *av7110, u32 config, int addr, unsigned int count) { struct saa7146_dev *dev = av7110->dev; u32 result = 0; - if (count > 32764 || count <= 0) { + if (count > 32764) { printk("%s: invalid count %d\n", __func__, count); return 0; } diff --git a/drivers/media/pci/ttpci/av7110_hw.h b/drivers/media/pci/ttpci/av7110_hw.h index 1634aba5cb84..ccb148059406 100644 --- a/drivers/media/pci/ttpci/av7110_hw.h +++ b/drivers/media/pci/ttpci/av7110_hw.h @@ -377,14 +377,14 @@ extern int av7110_fw_request(struct av7110 *av7110, u16 *request_buf, /* DEBI (saa7146 data extension bus interface) access */ extern int av7110_debiwrite(struct av7110 *av7110, u32 config, - int addr, u32 val, int count); + int addr, u32 val, unsigned int count); extern u32 av7110_debiread(struct av7110 *av7110, u32 config, - int addr, int count); + int addr, unsigned int count); /* DEBI during interrupt */ /* single word writes */ -static inline void iwdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline void iwdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { av7110_debiwrite(av7110, config, addr, val, count); } @@ -397,7 +397,7 @@ static inline void mwdebi(struct av7110 *av7110, u32 config, int addr, av7110_debiwrite(av7110, config, addr, 0, count); } -static inline u32 irdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline u32 irdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { u32 res; @@ -408,7 +408,7 @@ static inline u32 irdebi(struct av7110 *av7110, u32 config, int addr, u32 val, i } /* DEBI outside interrupts, only for count <= 4! */ -static inline void wdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline void wdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { unsigned long flags; @@ -417,7 +417,7 @@ static inline void wdebi(struct av7110 *av7110, u32 config, int addr, u32 val, i spin_unlock_irqrestore(&av7110->debilock, flags); } -static inline u32 rdebi(struct av7110 *av7110, u32 config, int addr, u32 val, int count) +static inline u32 rdebi(struct av7110 *av7110, u32 config, int addr, u32 val, unsigned int count) { unsigned long flags; u32 res; From c030c36a88cdc54a5d657c0a2ee630ba495d5538 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 8 Oct 2017 10:24:24 +0200 Subject: [PATCH 344/546] Linux 4.4.91 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index ca5aaaf4aef7..c1db50ef7fb5 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 90 +SUBLEVEL = 91 EXTRAVERSION = NAME = Blurry Fish Butt From c2eb312f3137d85a715af3f9009246b98e7ba993 Mon Sep 17 00:00:00 2001 From: David Eccher Date: Fri, 11 Dec 2015 22:13:55 +0100 Subject: [PATCH 345/546] usb: gadget: inode.c: fix unbalanced spin_lock in ep0_write commit b7bd98b7db9fc8fe19da1a5ff0215311c6b95e46 upstream. Fix bad unlock balance: ep0_write enter with the locks locked from inode.c:1769, hence it must exit with spinlock held to avoid double unlock in dev_config. Signed-off-by: David Eccher Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 43ce2cfcdb4d..2edb6948c552 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1140,10 +1140,9 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) dev->gadget->ep0, dev->req, GFP_KERNEL); } + spin_lock_irq(&dev->lock); if (retval < 0) { - spin_lock_irq (&dev->lock); clean_req (dev->gadget->ep0, dev->req); - spin_unlock_irq (&dev->lock); } else retval = len; From d20fff0b09d9c74584ced0221c8a5eabf7e1423c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:23:58 -0400 Subject: [PATCH 346/546] USB: gadgetfs: Fix crash caused by inadequate synchronization commit 520b72fc64debf8a86c3853b8e486aa5982188f0 upstream. The gadgetfs driver (drivers/usb/gadget/legacy/inode.c) was written before the UDC and composite frameworks were adopted; it is a legacy driver. As such, it expects that once bound to a UDC controller, it will not be unbound until it unregisters itself. However, the UDC framework does unbind function drivers while they are still registered. When this happens, it can cause the gadgetfs driver to misbehave or crash. For example, userspace can cause a crash by opening the device file and doing an ioctl call before setting up a configuration (found by Andrey Konovalov using the syzkaller fuzzer). This patch adds checks and synchronization to prevent these bad behaviors. It adds a udc_usage counter that the driver increments at times when it is using a gadget interface without holding the private spinlock. The unbind routine waits for this counter to go to 0 before returning, thereby ensuring that the UDC is no longer in use. The patch also adds a check in the dev_ioctl() routine to make sure the driver is bound to a UDC before dereferencing the gadget pointer, and it makes destroy_ep_files() synchronize with the endpoint I/O routines, to prevent the user from accessing an endpoint data structure after it has been removed. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 41 +++++++++++++++++++++++++++---- 1 file changed, 36 insertions(+), 5 deletions(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 2edb6948c552..0c940953a8e3 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -27,7 +27,7 @@ #include #include #include - +#include #include #include @@ -116,6 +116,7 @@ enum ep0_state { struct dev_data { spinlock_t lock; atomic_t count; + int udc_usage; enum ep0_state state; /* P: lock */ struct usb_gadgetfs_event event [N_EVENT]; unsigned ev_next; @@ -512,9 +513,9 @@ static void ep_aio_complete(struct usb_ep *ep, struct usb_request *req) INIT_WORK(&priv->work, ep_user_copy_worker); schedule_work(&priv->work); } - spin_unlock(&epdata->dev->lock); usb_ep_free_request(ep, req); + spin_unlock(&epdata->dev->lock); put_ep(epdata); } @@ -938,9 +939,11 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) struct usb_request *req = dev->req; if ((retval = setup_req (ep, req, 0)) == 0) { + ++dev->udc_usage; spin_unlock_irq (&dev->lock); retval = usb_ep_queue (ep, req, GFP_KERNEL); spin_lock_irq (&dev->lock); + --dev->udc_usage; } dev->state = STATE_DEV_CONNECTED; @@ -1130,6 +1133,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) retval = setup_req (dev->gadget->ep0, dev->req, len); if (retval == 0) { dev->state = STATE_DEV_CONNECTED; + ++dev->udc_usage; spin_unlock_irq (&dev->lock); if (copy_from_user (dev->req->buf, buf, len)) retval = -EFAULT; @@ -1141,6 +1145,7 @@ ep0_write (struct file *fd, const char __user *buf, size_t len, loff_t *ptr) GFP_KERNEL); } spin_lock_irq(&dev->lock); + --dev->udc_usage; if (retval < 0) { clean_req (dev->gadget->ep0, dev->req); } else @@ -1239,9 +1244,21 @@ static long dev_ioctl (struct file *fd, unsigned code, unsigned long value) struct usb_gadget *gadget = dev->gadget; long ret = -ENOTTY; - if (gadget->ops->ioctl) + spin_lock_irq(&dev->lock); + if (dev->state == STATE_DEV_OPENED || + dev->state == STATE_DEV_UNBOUND) { + /* Not bound to a UDC */ + } else if (gadget->ops->ioctl) { + ++dev->udc_usage; + spin_unlock_irq(&dev->lock); + ret = gadget->ops->ioctl (gadget, code, value); + spin_lock_irq(&dev->lock); + --dev->udc_usage; + } + spin_unlock_irq(&dev->lock); + return ret; } @@ -1459,10 +1476,12 @@ gadgetfs_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) if (value < 0) break; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, dev->req, GFP_KERNEL); spin_lock (&dev->lock); + --dev->udc_usage; if (value < 0) { clean_req (gadget->ep0, dev->req); break; @@ -1486,8 +1505,12 @@ gadgetfs_setup (struct usb_gadget *gadget, const struct usb_ctrlrequest *ctrl) req->length = value; req->zero = value < w_length; + ++dev->udc_usage; spin_unlock (&dev->lock); value = usb_ep_queue (gadget->ep0, req, GFP_KERNEL); + spin_lock(&dev->lock); + --dev->udc_usage; + spin_unlock(&dev->lock); if (value < 0) { DBG (dev, "ep_queue --> %d\n", value); req->status = 0; @@ -1514,21 +1537,24 @@ static void destroy_ep_files (struct dev_data *dev) /* break link to FS */ ep = list_first_entry (&dev->epfiles, struct ep_data, epfiles); list_del_init (&ep->epfiles); + spin_unlock_irq (&dev->lock); + dentry = ep->dentry; ep->dentry = NULL; parent = d_inode(dentry->d_parent); /* break link to controller */ + mutex_lock(&ep->lock); if (ep->state == STATE_EP_ENABLED) (void) usb_ep_disable (ep->ep); ep->state = STATE_EP_UNBOUND; usb_ep_free_request (ep->ep, ep->req); ep->ep = NULL; + mutex_unlock(&ep->lock); + wake_up (&ep->wait); put_ep (ep); - spin_unlock_irq (&dev->lock); - /* break link to dcache */ mutex_lock (&parent->i_mutex); d_delete (dentry); @@ -1599,6 +1625,11 @@ gadgetfs_unbind (struct usb_gadget *gadget) spin_lock_irq (&dev->lock); dev->state = STATE_DEV_UNBOUND; + while (dev->udc_usage > 0) { + spin_unlock_irq(&dev->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dev->lock); + } spin_unlock_irq (&dev->lock); destroy_ep_files (dev); From f72264e79ae74c845f9af8e2fdda86075afd0331 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 16:12:01 -0400 Subject: [PATCH 347/546] USB: gadgetfs: fix copy_to_user while holding spinlock commit 6e76c01e71551cb221c1f3deacb9dcd9a7346784 upstream. The gadgetfs driver as a long-outstanding FIXME, regarding a call of copy_to_user() made while holding a spinlock. This patch fixes the issue by dropping the spinlock and using the dev->udc_usage mechanism introduced by another recent patch to guard against status changes while the lock isn't held. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Acked-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/inode.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 0c940953a8e3..b6df47aa25af 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -985,11 +985,14 @@ ep0_read (struct file *fd, char __user *buf, size_t len, loff_t *ptr) retval = -EIO; else { len = min (len, (size_t)dev->req->actual); -// FIXME don't call this with the spinlock held ... + ++dev->udc_usage; + spin_unlock_irq(&dev->lock); if (copy_to_user (buf, dev->req->buf, len)) retval = -EFAULT; else retval = len; + spin_lock_irq(&dev->lock); + --dev->udc_usage; clean_req (dev->gadget->ep0, dev->req); /* NOTE userspace can't yet choose to stall */ } From 86377bf330898f0db850b65b371c2e4843cadf3c Mon Sep 17 00:00:00 2001 From: Nicolas Ferre Date: Thu, 31 Aug 2017 14:51:40 +0200 Subject: [PATCH 348/546] usb: gadget: udc: atmel: set vbus irqflags explicitly commit 6baeda120d90aa637b08f7604de104ab00ce9126 upstream. The driver triggers actions on both edges of the vbus signal. The former PIO controller was triggering IRQs on both falling and rising edges by default. Newer PIO controller don't, so it's better to set it explicitly to IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING. Without this patch we may trigger the connection with host but only on some bouncing signal conditions and thus lose connecting events. Acked-by: Ludovic Desroches Signed-off-by: Nicolas Ferre Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/atmel_usba_udc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index f92f5aff0dd5..585cb8734f50 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -28,6 +28,8 @@ #include #include "atmel_usba_udc.h" +#define USBA_VBUS_IRQFLAGS (IRQF_ONESHOT \ + | IRQF_TRIGGER_FALLING | IRQF_TRIGGER_RISING) #ifdef CONFIG_USB_GADGET_DEBUG_FS #include @@ -2185,7 +2187,7 @@ static int usba_udc_probe(struct platform_device *pdev) IRQ_NOAUTOEN); ret = devm_request_threaded_irq(&pdev->dev, gpio_to_irq(udc->vbus_pin), NULL, - usba_vbus_irq_thread, IRQF_ONESHOT, + usba_vbus_irq_thread, USBA_VBUS_IRQFLAGS, "atmel_usba_udc", udc); if (ret) { udc->vbus_pin = -ENODEV; From e85bd5be6088e33624b5303c621a143f7bfdea78 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 15:59:30 -0400 Subject: [PATCH 349/546] usb-storage: unusual_devs entry to fix write-access regression for Seagate external drives commit 113f6eb6d50cfa5e2a1cdcf1678b12661fa272ab upstream. Kris Lindgren reports that without the NO_WP_DETECT flag, his Seagate external disk drive fails all write accesses. This regresssion dates back approximately to the start of the 4.x kernel releases. Signed-off-by: Alan Stern Reported-by: Kris Lindgren Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/unusual_devs.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/usb/storage/unusual_devs.h b/drivers/usb/storage/unusual_devs.h index 640a2e2ec04d..fb96755550ec 100644 --- a/drivers/usb/storage/unusual_devs.h +++ b/drivers/usb/storage/unusual_devs.h @@ -1379,6 +1379,13 @@ UNUSUAL_DEV( 0x0bc2, 0x3010, 0x0000, 0x0000, USB_SC_DEVICE, USB_PR_DEVICE, NULL, US_FL_SANE_SENSE ), +/* Reported by Kris Lindgren */ +UNUSUAL_DEV( 0x0bc2, 0x3332, 0x0000, 0x9999, + "Seagate", + "External", + USB_SC_DEVICE, USB_PR_DEVICE, NULL, + US_FL_NO_WP_DETECT ), + UNUSUAL_DEV( 0x0d49, 0x7310, 0x0000, 0x9999, "Maxtor", "USB to SATA", From a7131ed818058cdfaf89565f9afa0626abc71ae9 Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:12 +0900 Subject: [PATCH 350/546] usb: renesas_usbhs: fix the BCLR setting condition for non-DCP pipe commit 6124607acc88fffeaadf3aacfeb3cc1304c87387 upstream. This patch fixes an issue that the driver sets the BCLR bit of {C,Dn}FIFOCTR register to 1 even when it's non-DCP pipe and the FRDY bit of {C,Dn}FIFOCTR register is set to 1. Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/fifo.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index 36e5b5c530bd..77897416b6a8 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -285,11 +285,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_fifo *fifo) { struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); + int ret = 0; if (!usbhs_pipe_is_dcp(pipe)) - usbhsf_fifo_barrier(priv, fifo); + ret = usbhsf_fifo_barrier(priv, fifo); - usbhs_write(priv, fifo->ctr, BCLR); + /* + * if non-DCP pipe, this driver should set BCLR when + * usbhsf_fifo_barrier() returns 0. + */ + if (!ret) + usbhs_write(priv, fifo->ctr, BCLR); } static int usbhsf_fifo_rcv_len(struct usbhs_priv *priv, From ccc6a475800da4223665ab1d41084115db2a6bae Mon Sep 17 00:00:00 2001 From: Yoshihiro Shimoda Date: Wed, 27 Sep 2017 18:47:13 +0900 Subject: [PATCH 351/546] usb: renesas_usbhs: fix usbhsf_fifo_clear() for RX direction commit 0a2ce62b61f2c76d0213edf4e37aaf54a8ddf295 upstream. This patch fixes an issue that the usbhsf_fifo_clear() is possible to cause 10 msec delay if the pipe is RX direction and empty because the FRDY bit will never be set to 1 in such case. Fixes: e8d548d54968 ("usb: renesas_usbhs: fifo became independent from pipe.") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/fifo.c | 13 +++++++++++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index 77897416b6a8..d95cd1a72b66 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -287,8 +287,17 @@ static void usbhsf_fifo_clear(struct usbhs_pipe *pipe, struct usbhs_priv *priv = usbhs_pipe_to_priv(pipe); int ret = 0; - if (!usbhs_pipe_is_dcp(pipe)) - ret = usbhsf_fifo_barrier(priv, fifo); + if (!usbhs_pipe_is_dcp(pipe)) { + /* + * This driver checks the pipe condition first to avoid -EBUSY + * from usbhsf_fifo_barrier() with about 10 msec delay in + * the interrupt handler if the pipe is RX direction and empty. + */ + if (usbhs_pipe_is_dir_in(pipe)) + ret = usbhs_pipe_is_accessible(pipe); + if (!ret) + ret = usbhsf_fifo_barrier(priv, fifo); + } /* * if non-DCP pipe, this driver should set BCLR when From 46c7b1fa4911a859a82575e3ffb55b34a89a222d Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Fri, 22 Sep 2017 16:18:53 +0200 Subject: [PATCH 352/546] ALSA: usb-audio: Check out-of-bounds access by corrupted buffer descriptor commit bfc81a8bc18e3c4ba0cbaa7666ff76be2f998991 upstream. When a USB-audio device receives a maliciously adjusted or corrupted buffer descriptor, the USB-audio driver may access an out-of-bounce value at its parser. This was detected by syzkaller, something like: BUG: KASAN: slab-out-of-bounds in usb_audio_probe+0x27b2/0x2ab0 Read of size 1 at addr ffff88006b83a9e8 by task kworker/0:1/24 CPU: 0 PID: 24 Comm: kworker/0:1 Not tainted 4.14.0-rc1-42251-gebb2c2437d80 #224 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x22f/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x19/0x20 mm/kasan/report.c:427 snd_usb_create_streams sound/usb/card.c:248 usb_audio_probe+0x27b2/0x2ab0 sound/usb/card.c:605 usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_set_configuration+0x104e/0x1870 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4903 hub_port_connect_change drivers/usb/core/hub.c:5009 port_event drivers/usb/core/hub.c:5115 hub_event+0x194d/0x3740 drivers/usb/core/hub.c:5195 process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119 worker_thread+0x221/0x1850 kernel/workqueue.c:2253 kthread+0x3a1/0x470 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 This patch adds the checks of out-of-bounce accesses at appropriate places and bails out when it goes out of the given buffer. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/card.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/sound/usb/card.c b/sound/usb/card.c index a1cbaa5f7fc9..83336bb6333e 100644 --- a/sound/usb/card.c +++ b/sound/usb/card.c @@ -217,6 +217,7 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) struct usb_interface_descriptor *altsd; void *control_header; int i, protocol; + int rest_bytes; /* find audiocontrol interface */ host_iface = &usb_ifnum_to_if(dev, ctrlif)->altsetting[0]; @@ -231,6 +232,15 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) return -EINVAL; } + rest_bytes = (void *)(host_iface->extra + host_iface->extralen) - + control_header; + + /* just to be sure -- this shouldn't hit at all */ + if (rest_bytes <= 0) { + dev_err(&dev->dev, "invalid control header\n"); + return -EINVAL; + } + switch (protocol) { default: dev_warn(&dev->dev, @@ -241,11 +251,21 @@ static int snd_usb_create_streams(struct snd_usb_audio *chip, int ctrlif) case UAC_VERSION_1: { struct uac1_ac_header_descriptor *h1 = control_header; + if (rest_bytes < sizeof(*h1)) { + dev_err(&dev->dev, "too short v1 buffer descriptor\n"); + return -EINVAL; + } + if (!h1->bInCollection) { dev_info(&dev->dev, "skipping empty audio interface (v1)\n"); return -EINVAL; } + if (rest_bytes < h1->bLength) { + dev_err(&dev->dev, "invalid buffer length (v1)\n"); + return -EINVAL; + } + if (h1->bLength < sizeof(*h1) + h1->bInCollection) { dev_err(&dev->dev, "invalid UAC_HEADER (v1)\n"); return -EINVAL; From da358168126b24206c162a5bd647a2d13377dccf Mon Sep 17 00:00:00 2001 From: Jim Dickerson Date: Mon, 18 Sep 2017 17:39:14 +0300 Subject: [PATCH 353/546] usb: pci-quirks.c: Corrected timeout values used in handshake commit 114ec3a6f9096d211a4aff4277793ba969a62c73 upstream. Servers were emitting failed handoff messages but were not waiting the full 1 second as designated in section 4.22.1 of the eXtensible Host Controller Interface specifications. The handshake was using wrong units so calls were made with milliseconds not microseconds. Comments referenced 5 seconds not 1 second as in specs. The wrong units were also corrected in a second handshake call. Signed-off-by: Jim Dickerson Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/pci-quirks.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/usb/host/pci-quirks.c b/drivers/usb/host/pci-quirks.c index 1fc6f478a02c..89e9494c3245 100644 --- a/drivers/usb/host/pci-quirks.c +++ b/drivers/usb/host/pci-quirks.c @@ -969,7 +969,7 @@ EXPORT_SYMBOL_GPL(usb_disable_xhci_ports); * * Takes care of the handoff between the Pre-OS (i.e. BIOS) and the OS. * It signals to the BIOS that the OS wants control of the host controller, - * and then waits 5 seconds for the BIOS to hand over control. + * and then waits 1 second for the BIOS to hand over control. * If we timeout, assume the BIOS is broken and take control anyway. */ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) @@ -1015,9 +1015,9 @@ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) if (val & XHCI_HC_BIOS_OWNED) { writel(val | XHCI_HC_OS_OWNED, base + ext_cap_offset); - /* Wait for 5 seconds with 10 microsecond polling interval */ + /* Wait for 1 second with 10 microsecond polling interval */ timeout = handshake(base + ext_cap_offset, XHCI_HC_BIOS_OWNED, - 0, 5000, 10); + 0, 1000000, 10); /* Assume a buggy BIOS and take HC ownership anyway */ if (timeout) { @@ -1046,7 +1046,7 @@ static void quirk_usb_handoff_xhci(struct pci_dev *pdev) * operational or runtime registers. Wait 5 seconds and no more. */ timeout = handshake(op_reg_base + XHCI_STS_OFFSET, XHCI_STS_CNR, 0, - 5000, 10); + 5000000, 10); /* Assume a buggy HC and start HC initialization anyway */ if (timeout) { val = readl(op_reg_base + XHCI_STS_OFFSET); From d25a65e03f1815130753031f17648802f8f42407 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:22 -0400 Subject: [PATCH 354/546] USB: dummy-hcd: fix connection failures (wrong speed) commit fe659bcc9b173bcfdd958ce2aec75e47651e74e1 upstream. The dummy-hcd UDC driver is not careful about the way it handles connection speeds. It ignores the module parameter that is supposed to govern the maximum connection speed and it doesn't set the HCD flags properly for the case where it ends up running at full speed. The result is that in many cases, gadget enumeration over dummy-hcd fails because the bMaxPacketSize byte in the device descriptor is set incorrectly. For example, the default settings call for a high-speed connection, but the maxpacket value for ep0 ends up being set for a Super-Speed connection. This patch fixes the problem by initializing the gadget's max_speed and the HCD flags correctly. Signed-off-by: Alan Stern Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 64f404a1a072..ae120a699467 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -1032,7 +1032,12 @@ static int dummy_udc_probe(struct platform_device *pdev) memzero_explicit(&dum->gadget, sizeof(struct usb_gadget)); dum->gadget.name = gadget_name; dum->gadget.ops = &dummy_ops; - dum->gadget.max_speed = USB_SPEED_SUPER; + if (mod_data.is_super_speed) + dum->gadget.max_speed = USB_SPEED_SUPER; + else if (mod_data.is_high_speed) + dum->gadget.max_speed = USB_SPEED_HIGH; + else + dum->gadget.max_speed = USB_SPEED_FULL; dum->gadget.dev.parent = &pdev->dev; init_dummy_udc_hw(dum); @@ -2564,8 +2569,6 @@ static struct hc_driver dummy_hcd = { .product_desc = "Dummy host controller", .hcd_priv_size = sizeof(struct dummy_hcd), - .flags = HCD_USB3 | HCD_SHARED, - .reset = dummy_setup, .start = dummy_start, .stop = dummy_stop, @@ -2594,8 +2597,12 @@ static int dummy_hcd_probe(struct platform_device *pdev) dev_info(&pdev->dev, "%s, driver " DRIVER_VERSION "\n", driver_desc); dum = *((void **)dev_get_platdata(&pdev->dev)); - if (!mod_data.is_super_speed) + if (mod_data.is_super_speed) + dummy_hcd.flags = HCD_USB3 | HCD_SHARED; + else if (mod_data.is_high_speed) dummy_hcd.flags = HCD_USB2; + else + dummy_hcd.flags = HCD_USB11; hs_hcd = usb_create_hcd(&dummy_hcd, &pdev->dev, dev_name(&pdev->dev)); if (!hs_hcd) return -ENOMEM; From d1a0787b5a244418d340901753aa49783ab53a90 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:40 -0400 Subject: [PATCH 355/546] USB: dummy-hcd: fix infinite-loop resubmission bug commit 0173a68bfb0ad1c72a6ee39cc485aa2c97540b98 upstream. The dummy-hcd HCD/UDC emulator tries not to do too much work during each timer interrupt. But it doesn't try very hard; currently all it does is limit the total amount of bulk data transferred. Other transfer types aren't limited, and URBs that transfer no data (because of an error, perhaps) don't count toward the limit, even though on a real USB bus they would consume at least a minimum overhead. This means it's possible to get the driver stuck in an infinite loop, for example, if the host class driver resubmits an URB every time it completes (which is common for interrupt URBs). Each time the URB is resubmitted it gets added to the end of the pending-URBs list, and dummy-hcd doesn't stop until that list is empty. Andrey Konovalov was able to trigger this failure mode using the syzkaller fuzzer. This patch fixes the infinite-loop problem by restricting the URBs handled during each timer interrupt to those that were already on the pending list when the interrupt routine started. Newly added URBs won't be processed until the next timer interrupt. The problem of properly accounting for non-bulk bandwidth (as well as packet and transaction overhead) is not addressed here. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index ae120a699467..7212b68851d9 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -237,6 +237,8 @@ struct dummy_hcd { struct usb_device *udev; struct list_head urbp_list; + struct urbp *next_frame_urbp; + u32 stream_en_ep; u8 num_stream[30 / 2]; @@ -1246,6 +1248,8 @@ static int dummy_urb_enqueue( list_add_tail(&urbp->urbp_list, &dum_hcd->urbp_list); urb->hcpriv = urbp; + if (!dum_hcd->next_frame_urbp) + dum_hcd->next_frame_urbp = urbp; if (usb_pipetype(urb->pipe) == PIPE_CONTROL) urb->error_count = 1; /* mark as a new urb */ @@ -1763,6 +1767,7 @@ static void dummy_timer(unsigned long _dum_hcd) spin_unlock_irqrestore(&dum->lock, flags); return; } + dum_hcd->next_frame_urbp = NULL; for (i = 0; i < DUMMY_ENDPOINTS; i++) { if (!ep_info[i].name) @@ -1779,6 +1784,10 @@ static void dummy_timer(unsigned long _dum_hcd) int type; int status = -EINPROGRESS; + /* stop when we reach URBs queued after the timer interrupt */ + if (urbp == dum_hcd->next_frame_urbp) + break; + urb = urbp->urb; if (urb->unlinked) goto return_urb; From e84b4a008365b7edbd842a063ae28d040a98db25 Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Tue, 26 Sep 2017 15:15:49 -0400 Subject: [PATCH 356/546] USB: dummy-hcd: Fix erroneous synchronization change commit 7dbd8f4cabd96db5a50513de9d83a8105a5ffc81 upstream. A recent change to the synchronization in dummy-hcd was incorrect. The issue was that dummy_udc_stop() contained no locking and therefore could race with various gadget driver callbacks, and the fix was to add locking and issue the callbacks with the private spinlock held. UDC drivers aren't supposed to do this. Gadget driver callback routines are allowed to invoke functions in the UDC driver, and these functions will generally try to acquire the private spinlock. This would deadlock the driver. The correct solution is to drop the spinlock before issuing callbacks, and avoid races by emulating the synchronize_irq() call that all real UDC drivers must perform in their ->udc_stop() routines after disabling interrupts. This involves adding a flag to dummy-hcd's private structure to keep track of whether interrupts are supposed to be enabled, and adding a counter to keep track of ongoing callbacks so that dummy_udc_stop() can wait for them all to finish. A real UDC driver won't receive disconnect, reset, suspend, resume, or setup events once it has disabled interrupts. dummy-hcd will receive them but won't try to issue any gadget driver callbacks, which should be just as good. Signed-off-by: Alan Stern Fixes: f16443a034c7 ("USB: gadgetfs, dummy-hcd, net2280: fix locking for callbacks") Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 32 ++++++++++++++++++++++++++++-- 1 file changed, 30 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index 7212b68851d9..db645c38055d 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -255,11 +255,13 @@ struct dummy { */ struct dummy_ep ep[DUMMY_ENDPOINTS]; int address; + int callback_usage; struct usb_gadget gadget; struct usb_gadget_driver *driver; struct dummy_request fifo_req; u8 fifo_buf[FIFO_SIZE]; u16 devstatus; + unsigned ints_enabled:1; unsigned udc_suspended:1; unsigned pullup:1; @@ -442,18 +444,27 @@ static void set_link_state(struct dummy_hcd *dum_hcd) (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ - if (dum->driver && (disconnect || reset)) { + if (dum->ints_enabled && (disconnect || reset)) { stop_activity(dum); + ++dum->callback_usage; + spin_unlock(&dum->lock); if (reset) usb_gadget_udc_reset(&dum->gadget, dum->driver); else dum->driver->disconnect(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } - } else if (dum_hcd->active != dum_hcd->old_active) { + } else if (dum_hcd->active != dum_hcd->old_active && + dum->ints_enabled) { + ++dum->callback_usage; + spin_unlock(&dum->lock); if (dum_hcd->old_active && dum->driver->suspend) dum->driver->suspend(&dum->gadget); else if (!dum_hcd->old_active && dum->driver->resume) dum->driver->resume(&dum->gadget); + spin_lock(&dum->lock); + --dum->callback_usage; } dum_hcd->old_status = dum_hcd->port_status; @@ -969,8 +980,11 @@ static int dummy_udc_start(struct usb_gadget *g, * can't enumerate without help from the driver we're binding. */ + spin_lock_irq(&dum->lock); dum->devstatus = 0; dum->driver = driver; + dum->ints_enabled = 1; + spin_unlock_irq(&dum->lock); return 0; } @@ -981,6 +995,16 @@ static int dummy_udc_stop(struct usb_gadget *g) struct dummy *dum = dum_hcd->dum; spin_lock_irq(&dum->lock); + dum->ints_enabled = 0; + stop_activity(dum); + + /* emulate synchronize_irq(): wait for callbacks to finish */ + while (dum->callback_usage > 0) { + spin_unlock_irq(&dum->lock); + usleep_range(1000, 2000); + spin_lock_irq(&dum->lock); + } + dum->driver = NULL; spin_unlock_irq(&dum->lock); @@ -1526,6 +1550,8 @@ static struct dummy_ep *find_endpoint(struct dummy *dum, u8 address) if (!is_active((dum->gadget.speed == USB_SPEED_SUPER ? dum->ss_hcd : dum->hs_hcd))) return NULL; + if (!dum->ints_enabled) + return NULL; if ((address & ~USB_DIR_IN) == 0) return &dum->ep[0]; for (i = 1; i < DUMMY_ENDPOINTS; i++) { @@ -1867,10 +1893,12 @@ static void dummy_timer(unsigned long _dum_hcd) * until setup() returns; no reentrancy issues etc. */ if (value > 0) { + ++dum->callback_usage; spin_unlock(&dum->lock); value = dum->driver->setup(&dum->gadget, &setup); spin_lock(&dum->lock); + --dum->callback_usage; if (value >= 0) { /* no delays (max 64KB data stage) */ From b74a45450f80a56a3aca515dd147bd95b18394bf Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 22 Sep 2017 23:43:46 +0300 Subject: [PATCH 357/546] USB: devio: Don't corrupt user memory commit fa1ed74eb1c233be6131ec92df21ab46499a15b6 upstream. The user buffer has "uurb->buffer_length" bytes. If the kernel has more information than that, we should truncate it instead of writing past the end of the user's buffer. I added a WARN_ONCE() to help the user debug the issue. Reported-by: Alan Stern Signed-off-by: Dan Carpenter Acked-by: Alan Stern Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/devio.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/drivers/usb/core/devio.c b/drivers/usb/core/devio.c index 873ba02d59e6..bd9419213d06 100644 --- a/drivers/usb/core/devio.c +++ b/drivers/usb/core/devio.c @@ -1417,7 +1417,11 @@ static int proc_do_submiturb(struct usb_dev_state *ps, struct usbdevfs_urb *uurb totlen += isopkt[u].length; } u *= sizeof(struct usb_iso_packet_descriptor); - uurb->buffer_length = totlen; + if (totlen <= uurb->buffer_length) + uurb->buffer_length = totlen; + else + WARN_ONCE(1, "uurb->buffer_length is too short %d vs %d", + totlen, uurb->buffer_length); break; default: From 2efab2c3a3ae18a6fe33d958230c480f7268e235 Mon Sep 17 00:00:00 2001 From: Li Jun Date: Fri, 14 Apr 2017 19:12:07 +0800 Subject: [PATCH 358/546] usb: gadget: mass_storage: set msg_registered after msg registered commit 8e55d30322c6a0ef746c256a1beda9c73ecb27a6 upstream. If there is no UDC available, the msg register will fail and this flag will not be set, but the driver is already added into pending driver list, then the module removal modprobe -r can not remove the driver from the pending list. Signed-off-by: Li Jun Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/legacy/mass_storage.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/usb/gadget/legacy/mass_storage.c b/drivers/usb/gadget/legacy/mass_storage.c index 99aa22c81770..dd3fbad566fa 100644 --- a/drivers/usb/gadget/legacy/mass_storage.c +++ b/drivers/usb/gadget/legacy/mass_storage.c @@ -210,7 +210,6 @@ static int msg_bind(struct usb_composite_dev *cdev) usb_composite_overwrite_options(cdev, &coverwrite); dev_info(&cdev->gadget->dev, DRIVER_DESC ", version: " DRIVER_VERSION "\n"); - set_bit(0, &msg_registered); return 0; fail_otg_desc: @@ -257,7 +256,12 @@ MODULE_LICENSE("GPL"); static int __init msg_init(void) { - return usb_composite_probe(&msg_driver); + int ret; + + ret = usb_composite_probe(&msg_driver); + set_bit(0, &msg_registered); + + return ret; } module_init(msg_init); From a44be3e548e444fb4890387fa9da48b98ed9ff3c Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Thu, 21 Sep 2017 13:22:00 -0400 Subject: [PATCH 359/546] USB: g_mass_storage: Fix deadlock when driver is unbound commit 1fbbb78f25d1291274f320462bf6908906f538db upstream. As a holdover from the old g_file_storage gadget, the g_mass_storage legacy gadget driver attempts to unregister itself when its main operating thread terminates (if it hasn't been unregistered already). This is not strictly necessary; it was never more than an attempt to have the gadget fail cleanly if something went wrong and the main thread was killed. However, now that the UDC core manages gadget drivers independently of UDC drivers, this scheme doesn't work any more. A simple test: modprobe dummy-hcd modprobe g-mass-storage file=... rmmod dummy-hcd ends up in a deadlock with the following backtrace: sysrq: SysRq : Show Blocked State task PC stack pid father file-storage D 0 1130 2 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_preempt_disabled+0xd/0xf __mutex_lock.isra.1+0x129/0x224 ? _raw_spin_unlock_irqrestore+0x12/0x14 __mutex_lock_slowpath+0x12/0x14 mutex_lock+0x28/0x2b usb_gadget_unregister_driver+0x29/0x9b [udc_core] usb_composite_unregister+0x10/0x12 [libcomposite] msg_cleanup+0x1d/0x20 [g_mass_storage] msg_thread_exits+0xd/0xdd7 [g_mass_storage] fsg_main_thread+0x1395/0x13d6 [usb_f_mass_storage] ? __schedule+0x573/0x58c kthread+0xd9/0xdb ? do_set_interface+0x25c/0x25c [usb_f_mass_storage] ? init_completion+0x1e/0x1e ret_from_fork+0x19/0x24 rmmod D 0 1155 683 0x00000000 Call Trace: __schedule+0x53e/0x58c schedule+0x6e/0x77 schedule_timeout+0x26/0xbc ? __schedule+0x573/0x58c do_wait_for_common+0xb3/0x128 ? usleep_range+0x81/0x81 ? wake_up_q+0x3f/0x3f wait_for_common+0x2e/0x45 wait_for_completion+0x17/0x19 fsg_common_put+0x34/0x81 [usb_f_mass_storage] fsg_free_inst+0x13/0x1e [usb_f_mass_storage] usb_put_function_instance+0x1a/0x25 [libcomposite] msg_unbind+0x2a/0x42 [g_mass_storage] __composite_unbind+0x4a/0x6f [libcomposite] composite_unbind+0x12/0x14 [libcomposite] usb_gadget_remove_driver+0x4f/0x77 [udc_core] usb_del_gadget_udc+0x52/0xcc [udc_core] dummy_udc_remove+0x27/0x2c [dummy_hcd] platform_drv_remove+0x1d/0x31 device_release_driver_internal+0xe9/0x16d device_release_driver+0x11/0x13 bus_remove_device+0xd2/0xe2 device_del+0x19f/0x221 ? selinux_capable+0x22/0x27 platform_device_del+0x21/0x63 platform_device_unregister+0x10/0x1a cleanup+0x20/0x817 [dummy_hcd] SyS_delete_module+0x10c/0x197 ? ____fput+0xd/0xf ? task_work_run+0x55/0x62 ? prepare_exit_to_usermode+0x65/0x75 do_fast_syscall_32+0x86/0xc3 entry_SYSENTER_32+0x4e/0x7c What happens is that removing the dummy-hcd driver causes the UDC core to unbind the gadget driver, which it does while holding the udc_lock mutex. The unbind routine in g_mass_storage tells the main thread to exit and waits for it to terminate. But as mentioned above, when the main thread exits it tries to unregister the mass-storage function driver. Via the composite framework this ends up calling usb_gadget_unregister_driver(), which tries to acquire the udc_lock mutex. The result is deadlock. The simplest way to fix the problem is not to be so clever: The main thread doesn't have to unregister the function driver. The side effects won't be so terrible; if the gadget is still attached to a USB host when the main thread is killed, it will appear to the host as though the gadget's firmware has crashed -- a reasonably accurate interpretation, and an all-too-common occurrence for USB mass-storage devices. In fact, the code to unregister the driver when the main thread exits is specific to g-mass-storage; it is not used when f-mass-storage is included as a function in a larger composite device. Therefore the entire mechanism responsible for this (the fsg_operations structure with its ->thread_exits method, the fsg_common_set_ops() routine, and the msg_thread_exits() callback routine) can all be eliminated. Even the msg_registered bitflag can be removed, because now the driver is unregistered in only one place rather than in two places. Signed-off-by: Alan Stern Acked-by: Felipe Balbi Acked-by: Michal Nazarewicz Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/function/f_mass_storage.c | 27 +++++--------------- drivers/usb/gadget/function/f_mass_storage.h | 14 ---------- drivers/usb/gadget/legacy/mass_storage.c | 26 +++---------------- 3 files changed, 10 insertions(+), 57 deletions(-) diff --git a/drivers/usb/gadget/function/f_mass_storage.c b/drivers/usb/gadget/function/f_mass_storage.c index a069726da72a..4dd3c7672247 100644 --- a/drivers/usb/gadget/function/f_mass_storage.c +++ b/drivers/usb/gadget/function/f_mass_storage.c @@ -306,8 +306,6 @@ struct fsg_common { struct completion thread_notifier; struct task_struct *thread_task; - /* Callback functions. */ - const struct fsg_operations *ops; /* Gadget's private data. */ void *private_data; @@ -2504,6 +2502,7 @@ static void handle_exception(struct fsg_common *common) static int fsg_main_thread(void *common_) { struct fsg_common *common = common_; + int i; /* * Allow the thread to be killed by a signal, but set the signal mask @@ -2565,21 +2564,16 @@ static int fsg_main_thread(void *common_) common->thread_task = NULL; spin_unlock_irq(&common->lock); - if (!common->ops || !common->ops->thread_exits - || common->ops->thread_exits(common) < 0) { - int i; + /* Eject media from all LUNs */ - down_write(&common->filesem); - for (i = 0; i < ARRAY_SIZE(common->luns); --i) { - struct fsg_lun *curlun = common->luns[i]; - if (!curlun || !fsg_lun_is_open(curlun)) - continue; + down_write(&common->filesem); + for (i = 0; i < ARRAY_SIZE(common->luns); i++) { + struct fsg_lun *curlun = common->luns[i]; + if (curlun && fsg_lun_is_open(curlun)) fsg_lun_close(curlun); - curlun->unit_attention_data = SS_MEDIUM_NOT_PRESENT; - } - up_write(&common->filesem); } + up_write(&common->filesem); /* Let fsg_unbind() know the thread has exited */ complete_and_exit(&common->thread_notifier, 0); @@ -2785,13 +2779,6 @@ void fsg_common_remove_luns(struct fsg_common *common) } EXPORT_SYMBOL_GPL(fsg_common_remove_luns); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops) -{ - common->ops = ops; -} -EXPORT_SYMBOL_GPL(fsg_common_set_ops); - void fsg_common_free_buffers(struct fsg_common *common) { _fsg_common_free_buffers(common->buffhds, common->fsg_num_buffers); diff --git a/drivers/usb/gadget/function/f_mass_storage.h b/drivers/usb/gadget/function/f_mass_storage.h index b6a9918eaefb..dfa2176f43c2 100644 --- a/drivers/usb/gadget/function/f_mass_storage.h +++ b/drivers/usb/gadget/function/f_mass_storage.h @@ -60,17 +60,6 @@ struct fsg_module_parameters { struct fsg_common; /* FSF callback functions */ -struct fsg_operations { - /* - * Callback function to call when thread exits. If no - * callback is set or it returns value lower then zero MSF - * will force eject all LUNs it operates on (including those - * marked as non-removable or with prevent_medium_removal flag - * set). - */ - int (*thread_exits)(struct fsg_common *common); -}; - struct fsg_lun_opts { struct config_group group; struct fsg_lun *lun; @@ -141,9 +130,6 @@ void fsg_common_remove_lun(struct fsg_lun *lun); void fsg_common_remove_luns(struct fsg_common *common); -void fsg_common_set_ops(struct fsg_common *common, - const struct fsg_operations *ops); - int fsg_common_create_lun(struct fsg_common *common, struct fsg_lun_config *cfg, unsigned int id, const char *name, const char **name_pfx); diff --git a/drivers/usb/gadget/legacy/mass_storage.c b/drivers/usb/gadget/legacy/mass_storage.c index dd3fbad566fa..b0099d7c3886 100644 --- a/drivers/usb/gadget/legacy/mass_storage.c +++ b/drivers/usb/gadget/legacy/mass_storage.c @@ -107,15 +107,6 @@ static unsigned int fsg_num_buffers = CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS; FSG_MODULE_PARAMETERS(/* no prefix */, mod_data); -static unsigned long msg_registered; -static void msg_cleanup(void); - -static int msg_thread_exits(struct fsg_common *common) -{ - msg_cleanup(); - return 0; -} - static int msg_do_config(struct usb_configuration *c) { struct fsg_opts *opts; @@ -154,9 +145,6 @@ static struct usb_configuration msg_config_driver = { static int msg_bind(struct usb_composite_dev *cdev) { - static const struct fsg_operations ops = { - .thread_exits = msg_thread_exits, - }; struct fsg_opts *opts; struct fsg_config config; int status; @@ -173,8 +161,6 @@ static int msg_bind(struct usb_composite_dev *cdev) if (status) goto fail; - fsg_common_set_ops(opts->common, &ops); - status = fsg_common_set_cdev(opts->common, cdev, config.can_stall); if (status) goto fail_set_cdev; @@ -256,18 +242,12 @@ MODULE_LICENSE("GPL"); static int __init msg_init(void) { - int ret; - - ret = usb_composite_probe(&msg_driver); - set_bit(0, &msg_registered); - - return ret; + return usb_composite_probe(&msg_driver); } module_init(msg_init); -static void msg_cleanup(void) +static void __exit msg_cleanup(void) { - if (test_and_clear_bit(0, &msg_registered)) - usb_composite_unregister(&msg_driver); + usb_composite_unregister(&msg_driver); } module_exit(msg_cleanup); From dd1f96a0a72c1f1a80c03e3ee2aa851f417edac2 Mon Sep 17 00:00:00 2001 From: Casey Schaufler Date: Tue, 19 Sep 2017 09:39:08 -0700 Subject: [PATCH 360/546] lsm: fix smack_inode_removexattr and xattr_getsecurity memleak commit 57e7ba04d422c3d41c8426380303ec9b7533ded9 upstream. security_inode_getsecurity() provides the text string value of a security attribute. It does not provide a "secctx". The code in xattr_getsecurity() that calls security_inode_getsecurity() and then calls security_release_secctx() happened to work because SElinux and Smack treat the attribute and the secctx the same way. It fails for cap_inode_getsecurity(), because that module has no secctx that ever needs releasing. It turns out that Smack is the one that's doing things wrong by not allocating memory when instructed to do so by the "alloc" parameter. The fix is simple enough. Change the security_release_secctx() to kfree() because it isn't a secctx being returned by security_inode_getsecurity(). Change Smack to allocate the string when told to do so. Note: this also fixes memory leaks for LSMs which implement inode_getsecurity but not release_secctx, such as capabilities. Signed-off-by: Casey Schaufler Reported-by: Konstantin Khlebnikov Signed-off-by: James Morris Signed-off-by: Greg Kroah-Hartman --- fs/xattr.c | 2 +- security/smack/smack_lsm.c | 59 +++++++++++++++++--------------------- 2 files changed, 28 insertions(+), 33 deletions(-) diff --git a/fs/xattr.c b/fs/xattr.c index f0da9d24e9ca..76f01bf4b048 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -163,7 +163,7 @@ xattr_getsecurity(struct inode *inode, const char *name, void *value, } memcpy(value, buffer, len); out: - security_release_secctx(buffer, len); + kfree(buffer); out_noalloc: return len; } diff --git a/security/smack/smack_lsm.c b/security/smack/smack_lsm.c index 7c57c7fcf5a2..735a1a9386d6 100644 --- a/security/smack/smack_lsm.c +++ b/security/smack/smack_lsm.c @@ -1459,7 +1459,7 @@ static int smack_inode_removexattr(struct dentry *dentry, const char *name) * @inode: the object * @name: attribute name * @buffer: where to put the result - * @alloc: unused + * @alloc: duplicate memory * * Returns the size of the attribute or an error code */ @@ -1472,43 +1472,38 @@ static int smack_inode_getsecurity(const struct inode *inode, struct super_block *sbp; struct inode *ip = (struct inode *)inode; struct smack_known *isp; - int ilen; - int rc = 0; - if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) { + if (strcmp(name, XATTR_SMACK_SUFFIX) == 0) isp = smk_of_inode(inode); - ilen = strlen(isp->smk_known); - *buffer = isp->smk_known; - return ilen; + else { + /* + * The rest of the Smack xattrs are only on sockets. + */ + sbp = ip->i_sb; + if (sbp->s_magic != SOCKFS_MAGIC) + return -EOPNOTSUPP; + + sock = SOCKET_I(ip); + if (sock == NULL || sock->sk == NULL) + return -EOPNOTSUPP; + + ssp = sock->sk->sk_security; + + if (strcmp(name, XATTR_SMACK_IPIN) == 0) + isp = ssp->smk_in; + else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) + isp = ssp->smk_out; + else + return -EOPNOTSUPP; } - /* - * The rest of the Smack xattrs are only on sockets. - */ - sbp = ip->i_sb; - if (sbp->s_magic != SOCKFS_MAGIC) - return -EOPNOTSUPP; - - sock = SOCKET_I(ip); - if (sock == NULL || sock->sk == NULL) - return -EOPNOTSUPP; - - ssp = sock->sk->sk_security; - - if (strcmp(name, XATTR_SMACK_IPIN) == 0) - isp = ssp->smk_in; - else if (strcmp(name, XATTR_SMACK_IPOUT) == 0) - isp = ssp->smk_out; - else - return -EOPNOTSUPP; - - ilen = strlen(isp->smk_known); - if (rc == 0) { - *buffer = isp->smk_known; - rc = ilen; + if (alloc) { + *buffer = kstrdup(isp->smk_known, GFP_KERNEL); + if (*buffer == NULL) + return -ENOMEM; } - return rc; + return strlen(isp->smk_known); } From 8cff1556ddbc0ea8639b9f5627470fc38a58c46e Mon Sep 17 00:00:00 2001 From: Guneshwor Singh Date: Thu, 14 Sep 2017 17:49:40 +0530 Subject: [PATCH 361/546] ALSA: compress: Remove unused variable commit a931b9ce93841a5b66b709ba5a244276e345e63b upstream. Commit 04c5d5a430fc ("ALSA: compress: Embed struct device") removed the statement that used 'str' but didn't remove the variable itself. So remove it. [Adding stable to Cc since pr_debug() may refer to the uninitialized buffer -- tiwai] Fixes: 04c5d5a430fc ("ALSA: compress: Embed struct device") Signed-off-by: Guneshwor Singh Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/compress_offload.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/sound/core/compress_offload.c b/sound/core/compress_offload.c index b554d7f9e3be..6163bf3e8177 100644 --- a/sound/core/compress_offload.c +++ b/sound/core/compress_offload.c @@ -872,14 +872,13 @@ static const struct file_operations snd_compr_file_ops = { static int snd_compress_dev_register(struct snd_device *device) { int ret = -EINVAL; - char str[16]; struct snd_compr *compr; if (snd_BUG_ON(!device || !device->device_data)) return -EBADFD; compr = device->device_data; - pr_debug("reg %s for device %s, direction %d\n", str, compr->name, + pr_debug("reg device %s, direction %d\n", compr->name, compr->direction); /* register compressed device */ ret = snd_register_device(SNDRV_DEVICE_TYPE_COMPRESS, From 6d1bc9ee4c2d4cc5cb5ead3fafe5fd9d85455428 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 2 Oct 2017 14:06:43 +0200 Subject: [PATCH 362/546] ALSA: usx2y: Suppress kernel warning at page allocation failures commit 7682e399485fe19622b6fd82510b1f4551e48a25 upstream. The usx2y driver allocates the stream read/write buffers in continuous pages depending on the stream setup, and this may spew the kernel warning messages with a stack trace like: WARNING: CPU: 1 PID: 1846 at mm/page_alloc.c:3883 __alloc_pages_slowpath+0x1ef2/0x2d70 Modules linked in: CPU: 1 PID: 1846 Comm: kworker/1:2 Not tainted .... It may confuse user as if it were any serious error, although this is no fatal error and the driver handles the error case gracefully. Since the driver has already some sanity check of the given size (128 and 256 pages), it can't pass any crazy value. So it's merely page fragmentation. This patch adds __GFP_NOWARN to each caller for suppressing such kernel warnings. The original issue was spotted by syzkaller. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/usx2y/usb_stream.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/sound/usb/usx2y/usb_stream.c b/sound/usb/usx2y/usb_stream.c index bf618e1500ac..e7b934f4d837 100644 --- a/sound/usb/usx2y/usb_stream.c +++ b/sound/usb/usx2y/usb_stream.c @@ -191,7 +191,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, } pg = get_order(read_size); - sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + sk->s = (void *) __get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->s) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); goto out; @@ -211,7 +212,8 @@ struct usb_stream *usb_stream_new(struct usb_stream_kernel *sk, pg = get_order(write_size); sk->write_page = - (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO, pg); + (void *)__get_free_pages(GFP_KERNEL|__GFP_COMP|__GFP_ZERO| + __GFP_NOWARN, pg); if (!sk->write_page) { snd_printk(KERN_WARNING "couldn't __get_free_pages()\n"); usb_stream_free(sk); From 2b91a52e156910155317c27130db29712c560e4e Mon Sep 17 00:00:00 2001 From: Nicolai Stange Date: Mon, 11 Sep 2017 09:45:42 +0200 Subject: [PATCH 363/546] driver core: platform: Don't read past the end of "driver_override" buffer commit bf563b01c2895a4bfd1a29cc5abc67fe706ecffd upstream. When printing the driver_override parameter when it is 4095 and 4094 bytes long, the printing code would access invalid memory because we need count+1 bytes for printing. Reject driver_override values of these lengths in driver_override_store(). This is in close analogy to commit 4efe874aace5 ("PCI: Don't read past the end of sysfs "driver_override" buffer") from Sasha Levin. Fixes: 3d713e0e382e ("driver core: platform: add device binding path 'driver_override'") Signed-off-by: Nicolai Stange Signed-off-by: Greg Kroah-Hartman --- drivers/base/platform.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/base/platform.c b/drivers/base/platform.c index cb4ad6e98b28..065fcc4be263 100644 --- a/drivers/base/platform.c +++ b/drivers/base/platform.c @@ -809,7 +809,8 @@ static ssize_t driver_override_store(struct device *dev, struct platform_device *pdev = to_platform_device(dev); char *driver_override, *old, *cp; - if (count > PATH_MAX) + /* We need to keep extra room for a newline */ + if (count >= (PAGE_SIZE - 1)) return -EINVAL; driver_override = kstrndup(buf, count, GFP_KERNEL); From c85e9442f9e4ba8d36694f07e5129580be60e9d9 Mon Sep 17 00:00:00 2001 From: Olaf Hering Date: Thu, 21 Sep 2017 23:41:48 -0700 Subject: [PATCH 364/546] Drivers: hv: fcopy: restore correct transfer length commit 549e658a0919e355a2b2144dc380b3729bef7f3e upstream. Till recently the expected length of bytes read by the daemon did depend on the context. It was either hv_start_fcopy or hv_do_fcopy. The daemon had a buffer size of two pages, which was much larger than needed. Now the expected length of bytes read by the daemon changed slightly. For START_FILE_COPY it is still the size of hv_start_fcopy. But for WRITE_TO_FILE and the other operations it is as large as the buffer that arrived via vmbus. In case of WRITE_TO_FILE that is slightly larger than a struct hv_do_fcopy. Since the buffer in the daemon was still larger everything was fine. Currently, the daemon reads only what is actually needed. The new buffer layout is as large as a struct hv_do_fcopy, for the WRITE_TO_FILE operation. Since the kernel expects a slightly larger size, hvt_op_read will return -EINVAL because the daemon will read slightly less than expected. Address this by restoring the expected buffer size in case of WRITE_TO_FILE. Fixes: 'c7e490fc23eb ("Drivers: hv: fcopy: convert to hv_utils_transport")' Fixes: '3f2baa8a7d2e ("Tools: hv: update buffer handling in hv_fcopy_daemon")' Signed-off-by: Olaf Hering Signed-off-by: K. Y. Srinivasan Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/hv/hv_fcopy.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/hv/hv_fcopy.c b/drivers/hv/hv_fcopy.c index 1fb02dcbc500..12dcbd8226f2 100644 --- a/drivers/hv/hv_fcopy.c +++ b/drivers/hv/hv_fcopy.c @@ -155,6 +155,10 @@ static void fcopy_send_data(struct work_struct *dummy) out_src = smsg_out; break; + case WRITE_TO_FILE: + out_src = fcopy_transaction.fcopy_msg; + out_len = sizeof(struct hv_do_fcopy); + break; default: out_src = fcopy_transaction.fcopy_msg; out_len = fcopy_transaction.recv_len; From 60623d7ca38d7f1e99979ada4199bf1c4143e9f9 Mon Sep 17 00:00:00 2001 From: Alexander Shishkin Date: Tue, 19 Sep 2017 18:47:40 +0300 Subject: [PATCH 365/546] stm class: Fix a use-after-free commit fd085bb1766d6a598f53af2308374a546a49775a upstream. For reasons unknown, the stm_source removal path uses device_destroy() to kill the underlying device object. Because device_destroy() uses devt to look for the device to destroy and the fact that stm_source devices don't have one (or all have the same one), it just picks the first device in the class, which may well be the wrong one. That is, loading stm_console and stm_heartbeat and then removing both will die in dereferencing a freed object. Since this should have been device_unregister() in the first place, use it instead of device_destroy(). Signed-off-by: Alexander Shishkin Fixes: 7bd1d4093c2 ("stm class: Introduce an abstraction for System Trace Module devices") Signed-off-by: Greg Kroah-Hartman --- drivers/hwtracing/stm/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/hwtracing/stm/core.c b/drivers/hwtracing/stm/core.c index b6445d9e5453..d2dff159a471 100644 --- a/drivers/hwtracing/stm/core.c +++ b/drivers/hwtracing/stm/core.c @@ -952,7 +952,7 @@ void stm_source_unregister_device(struct stm_source_data *data) stm_source_link_drop(src); - device_destroy(&stm_source_class, src->dev.devt); + device_unregister(&src->dev); } EXPORT_SYMBOL_GPL(stm_source_unregister_device); From 87509592ecc3a31dbfd669bb49d53e33cb6b6304 Mon Sep 17 00:00:00 2001 From: Shu Wang Date: Tue, 12 Sep 2017 10:14:54 +0800 Subject: [PATCH 366/546] ftrace: Fix kmemleak in unregister_ftrace_graph commit 2b0b8499ae75df91455bbeb7491d45affc384fb0 upstream. The trampoline allocated by function tracer was overwriten by function_graph tracer, and caused a memory leak. The save_global_trampoline should have saved the previous trampoline in register_ftrace_graph() and restored it in unregister_ftrace_graph(). But as it is implemented, save_global_trampoline was only used in unregister_ftrace_graph as default value 0, and it overwrote the previous trampoline's value. Causing the previous allocated trampoline to be lost. kmmeleak backtrace: kmemleak_vmalloc+0x77/0xc0 __vmalloc_node_range+0x1b5/0x2c0 module_alloc+0x7c/0xd0 arch_ftrace_update_trampoline+0xb5/0x290 ftrace_startup+0x78/0x210 register_ftrace_function+0x8b/0xd0 function_trace_init+0x4f/0x80 tracing_set_tracer+0xe6/0x170 tracing_set_trace_write+0x90/0xd0 __vfs_write+0x37/0x170 vfs_write+0xb2/0x1b0 SyS_write+0x55/0xc0 do_syscall_64+0x67/0x180 return_from_SYSCALL_64+0x0/0x6a [ Looking further into this, I found that this was left over from when the function and function graph tracers shared the same ftrace_ops. But in commit 5f151b2401 ("ftrace: Fix function_profiler and function tracer together"), the two were separated, and the save_global_trampoline no longer was necessary (and it may have been broken back then too). -- Steven Rostedt ] Link: http://lkml.kernel.org/r/20170912021454.5976-1-shuwang@redhat.com Fixes: 5f151b2401 ("ftrace: Fix function_profiler and function tracer together") Signed-off-by: Shu Wang Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/trace/ftrace.c | 14 -------------- 1 file changed, 14 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index 38d73a6e2857..fc0051fd672d 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -4315,9 +4315,6 @@ static char ftrace_graph_buf[FTRACE_FILTER_SIZE] __initdata; static char ftrace_graph_notrace_buf[FTRACE_FILTER_SIZE] __initdata; static int ftrace_set_func(unsigned long *array, int *idx, int size, char *buffer); -static unsigned long save_global_trampoline; -static unsigned long save_global_flags; - static int __init set_graph_function(char *str) { strlcpy(ftrace_graph_buf, str, FTRACE_FILTER_SIZE); @@ -5907,17 +5904,6 @@ void unregister_ftrace_graph(void) unregister_pm_notifier(&ftrace_suspend_notifier); unregister_trace_sched_switch(ftrace_graph_probe_sched_switch, NULL); -#ifdef CONFIG_DYNAMIC_FTRACE - /* - * Function graph does not allocate the trampoline, but - * other global_ops do. We need to reset the ALLOC_TRAMP flag - * if one was used. - */ - global_ops.trampoline = save_global_trampoline; - if (save_global_flags & FTRACE_OPS_FL_ALLOC_TRAMP) - global_ops.flags |= FTRACE_OPS_FL_ALLOC_TRAMP; -#endif - out: mutex_unlock(&ftrace_lock); } From 146a9dc99025f8026b5ef6811fe0e57a631a9928 Mon Sep 17 00:00:00 2001 From: Adrian Salido Date: Fri, 8 Sep 2017 10:55:27 -0700 Subject: [PATCH 367/546] HID: i2c-hid: allocate hid buffers for real worst case commit 8320caeeffdefec3b58b9d4a7ed8e1079492fe7b upstream. The buffer allocation is not currently accounting for an extra byte for the report id. This can cause an out of bounds access in function i2c_hid_set_or_send_report() with reportID > 15. Signed-off-by: Adrian Salido Reviewed-by: Benson Leung Signed-off-by: Guenter Roeck Signed-off-by: Dmitry Torokhov Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/i2c-hid/i2c-hid.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/hid/i2c-hid/i2c-hid.c b/drivers/hid/i2c-hid/i2c-hid.c index d4d655a10df1..312aa1e33fb2 100644 --- a/drivers/hid/i2c-hid/i2c-hid.c +++ b/drivers/hid/i2c-hid/i2c-hid.c @@ -540,7 +540,8 @@ static int i2c_hid_alloc_buffers(struct i2c_hid *ihid, size_t report_size) { /* the worst case is computed from the set_report command with a * reportID > 15 and the maximum report length */ - int args_len = sizeof(__u8) + /* optional ReportID byte */ + int args_len = sizeof(__u8) + /* ReportID */ + sizeof(__u8) + /* optional ReportID byte */ sizeof(__u16) + /* data register */ sizeof(__u16) + /* size of the report */ report_size; /* report */ From fc29713fa7c78fda30855444eeab2d5ea8088762 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Tue, 15 Aug 2017 20:48:41 +0300 Subject: [PATCH 368/546] iwlwifi: add workaround to disable wide channels in 5GHz commit 01a9c948a09348950515bf2abb6113ed83e696d8 upstream. The OTP in some SKUs have erroneously allowed 40MHz and 80MHz channels in the 5.2GHz band. The firmware has been modified to not allow this in those SKUs, so the driver needs to do the same otherwise the firmware will assert when we try to use it. Signed-off-by: Luca Coelho Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/iwlwifi/iwl-nvm-parse.c | 70 ++++++++++++++++---- 1 file changed, 58 insertions(+), 12 deletions(-) diff --git a/drivers/net/wireless/iwlwifi/iwl-nvm-parse.c b/drivers/net/wireless/iwlwifi/iwl-nvm-parse.c index d82984912e04..95b82cc132e6 100644 --- a/drivers/net/wireless/iwlwifi/iwl-nvm-parse.c +++ b/drivers/net/wireless/iwlwifi/iwl-nvm-parse.c @@ -73,6 +73,7 @@ /* NVM offsets (in words) definitions */ enum wkp_nvm_offsets { /* NVM HW-Section offset (in words) definitions */ + SUBSYSTEM_ID = 0x0A, HW_ADDR = 0x15, /* NVM SW-Section offset (in words) definitions */ @@ -257,13 +258,12 @@ static u32 iwl_get_channel_flags(u8 ch_num, int ch_idx, bool is_5ghz, static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, struct iwl_nvm_data *data, const __le16 * const nvm_ch_flags, - bool lar_supported) + bool lar_supported, bool no_wide_in_5ghz) { int ch_idx; int n_channels = 0; struct ieee80211_channel *channel; u16 ch_flags; - bool is_5ghz; int num_of_ch, num_2ghz_channels; const u8 *nvm_chan; @@ -278,12 +278,20 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, } for (ch_idx = 0; ch_idx < num_of_ch; ch_idx++) { + bool is_5ghz = (ch_idx >= num_2ghz_channels); + ch_flags = __le16_to_cpup(nvm_ch_flags + ch_idx); - if (ch_idx >= num_2ghz_channels && - !data->sku_cap_band_52GHz_enable) + if (is_5ghz && !data->sku_cap_band_52GHz_enable) continue; + /* workaround to disable wide channels in 5GHz */ + if (no_wide_in_5ghz && is_5ghz) { + ch_flags &= ~(NVM_CHANNEL_40MHZ | + NVM_CHANNEL_80MHZ | + NVM_CHANNEL_160MHZ); + } + if (!lar_supported && !(ch_flags & NVM_CHANNEL_VALID)) { /* * Channels might become valid later if lar is @@ -303,8 +311,8 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, n_channels++; channel->hw_value = nvm_chan[ch_idx]; - channel->band = (ch_idx < num_2ghz_channels) ? - IEEE80211_BAND_2GHZ : IEEE80211_BAND_5GHZ; + channel->band = is_5ghz ? + IEEE80211_BAND_5GHZ : IEEE80211_BAND_2GHZ; channel->center_freq = ieee80211_channel_to_frequency( channel->hw_value, channel->band); @@ -316,7 +324,6 @@ static int iwl_init_channel_map(struct device *dev, const struct iwl_cfg *cfg, * is not used in mvm, and is used for backwards compatibility */ channel->max_power = IWL_DEFAULT_MAX_TX_POWER; - is_5ghz = channel->band == IEEE80211_BAND_5GHZ; /* don't put limitations in case we're using LAR */ if (!lar_supported) @@ -405,7 +412,8 @@ static void iwl_init_vht_hw_capab(const struct iwl_cfg *cfg, static void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg, struct iwl_nvm_data *data, const __le16 *ch_section, - u8 tx_chains, u8 rx_chains, bool lar_supported) + u8 tx_chains, u8 rx_chains, bool lar_supported, + bool no_wide_in_5ghz) { int n_channels; int n_used = 0; @@ -414,12 +422,14 @@ static void iwl_init_sbands(struct device *dev, const struct iwl_cfg *cfg, if (cfg->device_family != IWL_DEVICE_FAMILY_8000) n_channels = iwl_init_channel_map( dev, cfg, data, - &ch_section[NVM_CHANNELS], lar_supported); + &ch_section[NVM_CHANNELS], lar_supported, + no_wide_in_5ghz); else n_channels = iwl_init_channel_map( dev, cfg, data, &ch_section[NVM_CHANNELS_FAMILY_8000], - lar_supported); + lar_supported, + no_wide_in_5ghz); sband = &data->bands[IEEE80211_BAND_2GHZ]; sband->band = IEEE80211_BAND_2GHZ; @@ -582,6 +592,39 @@ static void iwl_set_hw_address_family_8000(struct device *dev, #define IWL_4165_DEVICE_ID 0x5501 +static bool +iwl_nvm_no_wide_in_5ghz(struct device *dev, const struct iwl_cfg *cfg, + const __le16 *nvm_hw) +{ + /* + * Workaround a bug in Indonesia SKUs where the regulatory in + * some 7000-family OTPs erroneously allow wide channels in + * 5GHz. To check for Indonesia, we take the SKU value from + * bits 1-4 in the subsystem ID and check if it is either 5 or + * 9. In those cases, we need to force-disable wide channels + * in 5GHz otherwise the FW will throw a sysassert when we try + * to use them. + */ + if (cfg->device_family == IWL_DEVICE_FAMILY_7000) { + /* + * Unlike the other sections in the NVM, the hw + * section uses big-endian. + */ + u16 subsystem_id = be16_to_cpup((const __be16 *)nvm_hw + + SUBSYSTEM_ID); + u8 sku = (subsystem_id & 0x1e) >> 1; + + if (sku == 5 || sku == 9) { + IWL_DEBUG_EEPROM(dev, + "disabling wide channels in 5GHz (0x%0x %d)\n", + subsystem_id, sku); + return true; + } + } + + return false; +} + struct iwl_nvm_data * iwl_parse_nvm_data(struct device *dev, const struct iwl_cfg *cfg, const __le16 *nvm_hw, const __le16 *nvm_sw, @@ -591,6 +634,7 @@ iwl_parse_nvm_data(struct device *dev, const struct iwl_cfg *cfg, u32 mac_addr0, u32 mac_addr1, u32 hw_id) { struct iwl_nvm_data *data; + bool no_wide_in_5ghz = iwl_nvm_no_wide_in_5ghz(dev, cfg, nvm_hw); u32 sku; u32 radio_cfg; u16 lar_config; @@ -657,7 +701,8 @@ iwl_parse_nvm_data(struct device *dev, const struct iwl_cfg *cfg, iwl_set_hw_address(cfg, data, nvm_hw); iwl_init_sbands(dev, cfg, data, nvm_sw, - tx_chains, rx_chains, lar_fw_supported); + tx_chains, rx_chains, lar_fw_supported, + no_wide_in_5ghz); } else { u16 lar_offset = data->nvm_version < 0xE39 ? NVM_LAR_OFFSET_FAMILY_8000_OLD : @@ -673,7 +718,8 @@ iwl_parse_nvm_data(struct device *dev, const struct iwl_cfg *cfg, iwl_init_sbands(dev, cfg, data, regulatory, tx_chains, rx_chains, - lar_fw_supported && data->lar_enabled); + lar_fw_supported && data->lar_enabled, + no_wide_in_5ghz); } data->calib_version = 255; From 9e78ac87626a92fdf71e519f71a76c6f6d4dfea3 Mon Sep 17 00:00:00 2001 From: "Martin K. Petersen" Date: Wed, 27 Sep 2017 21:38:59 -0400 Subject: [PATCH 369/546] scsi: sd: Do not override max_sectors_kb sysfs setting commit 77082ca503bed061f7fbda7cfd7c93beda967a41 upstream. A user may lower the max_sectors_kb setting in sysfs to accommodate certain workloads. Previously we would always set the max I/O size to either the block layer default or the optional preferred I/O size reported by the device. Keep the current heuristics for the initial setting of max_sectors_kb. For subsequent invocations, only update the current queue limit if it exceeds the capabilities of the hardware. Reported-by: Don Brace Reviewed-by: Martin Wilck Tested-by: Don Brace Signed-off-by: Martin K. Petersen Signed-off-by: Greg Kroah-Hartman --- drivers/scsi/sd.c | 19 ++++++++++++++----- 1 file changed, 14 insertions(+), 5 deletions(-) diff --git a/drivers/scsi/sd.c b/drivers/scsi/sd.c index 8750c86f95f9..7e1681cf287c 100644 --- a/drivers/scsi/sd.c +++ b/drivers/scsi/sd.c @@ -2878,8 +2878,6 @@ static int sd_revalidate_disk(struct gendisk *disk) sd_read_write_same(sdkp, buffer); } - sdkp->first_scan = 0; - /* * We now have all cache related info, determine how we deal * with flush requests. @@ -2894,7 +2892,7 @@ static int sd_revalidate_disk(struct gendisk *disk) q->limits.max_dev_sectors = logical_to_sectors(sdp, dev_max); /* - * Use the device's preferred I/O size for reads and writes + * Determine the device's preferred I/O size for reads and writes * unless the reported value is unreasonably small, large, or * garbage. */ @@ -2908,8 +2906,19 @@ static int sd_revalidate_disk(struct gendisk *disk) rw_max = min_not_zero(logical_to_sectors(sdp, dev_max), (sector_t)BLK_DEF_MAX_SECTORS); - /* Combine with controller limits */ - q->limits.max_sectors = min(rw_max, queue_max_hw_sectors(q)); + /* Do not exceed controller limit */ + rw_max = min(rw_max, queue_max_hw_sectors(q)); + + /* + * Only update max_sectors if previously unset or if the current value + * exceeds the capabilities of the hardware. + */ + if (sdkp->first_scan || + q->limits.max_sectors > q->limits.max_dev_sectors || + q->limits.max_sectors > q->limits.max_hw_sectors) + q->limits.max_sectors = rw_max; + + sdkp->first_scan = 0; set_capacity(disk, logical_to_sectors(sdp, sdkp->capacity)); sd_config_write_same(sdkp); From 5d9a9c3dcc1f63215b5a5b877be589974ec4f31d Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 22 Sep 2017 11:56:49 -0400 Subject: [PATCH 370/546] USB: uas: fix bug in handling of alternate settings commit 786de92b3cb26012d3d0f00ee37adf14527f35c4 upstream. The uas driver has a subtle bug in the way it handles alternate settings. The uas_find_uas_alt_setting() routine returns an altsetting value (the bAlternateSetting number in the descriptor), but uas_use_uas_driver() then treats that value as an index to the intf->altsetting array, which it isn't. Normally this doesn't cause any problems because the various alternate settings have bAlternateSetting values 0, 1, 2, ..., so the value is equal to the index in the array. But this is not guaranteed, and Andrey Konovalov used the syzkaller fuzzer with KASAN to get a slab-out-of-bounds error by violating this assumption. This patch fixes the bug by making uas_find_uas_alt_setting() return a pointer to the altsetting entry rather than either the value or the index. Pointers are less subject to misinterpretation. Signed-off-by: Alan Stern Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov CC: Oliver Neukum Signed-off-by: Greg Kroah-Hartman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/storage/uas-detect.h | 15 ++++++++------- drivers/usb/storage/uas.c | 10 +++++----- 2 files changed, 13 insertions(+), 12 deletions(-) diff --git a/drivers/usb/storage/uas-detect.h b/drivers/usb/storage/uas-detect.h index f58caa9e6a27..a155cd02bce2 100644 --- a/drivers/usb/storage/uas-detect.h +++ b/drivers/usb/storage/uas-detect.h @@ -9,7 +9,8 @@ static int uas_is_interface(struct usb_host_interface *intf) intf->desc.bInterfaceProtocol == USB_PR_UAS); } -static int uas_find_uas_alt_setting(struct usb_interface *intf) +static struct usb_host_interface *uas_find_uas_alt_setting( + struct usb_interface *intf) { int i; @@ -17,10 +18,10 @@ static int uas_find_uas_alt_setting(struct usb_interface *intf) struct usb_host_interface *alt = &intf->altsetting[i]; if (uas_is_interface(alt)) - return alt->desc.bAlternateSetting; + return alt; } - return -ENODEV; + return NULL; } static int uas_find_endpoints(struct usb_host_interface *alt, @@ -58,14 +59,14 @@ static int uas_use_uas_driver(struct usb_interface *intf, struct usb_device *udev = interface_to_usbdev(intf); struct usb_hcd *hcd = bus_to_hcd(udev->bus); unsigned long flags = id->driver_info; - int r, alt; - + struct usb_host_interface *alt; + int r; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) + if (!alt) return 0; - r = uas_find_endpoints(&intf->altsetting[alt], eps); + r = uas_find_endpoints(alt, eps); if (r < 0) return 0; diff --git a/drivers/usb/storage/uas.c b/drivers/usb/storage/uas.c index e26e32169a36..f952635ebe5f 100644 --- a/drivers/usb/storage/uas.c +++ b/drivers/usb/storage/uas.c @@ -849,14 +849,14 @@ MODULE_DEVICE_TABLE(usb, uas_usb_ids); static int uas_switch_interface(struct usb_device *udev, struct usb_interface *intf) { - int alt; + struct usb_host_interface *alt; alt = uas_find_uas_alt_setting(intf); - if (alt < 0) - return alt; + if (!alt) + return -ENODEV; - return usb_set_interface(udev, - intf->altsetting[0].desc.bInterfaceNumber, alt); + return usb_set_interface(udev, alt->desc.bInterfaceNumber, + alt->desc.bAlternateSetting); } static int uas_configure_endpoints(struct uas_dev_info *devinfo) From feab51a916ed07219dee38b898fe22bd2a98193a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 21 Sep 2017 16:58:48 +0200 Subject: [PATCH 371/546] USB: core: harden cdc_parse_cdc_header commit 2e1c42391ff2556387b3cb6308b24f6f65619feb upstream. Andrey Konovalov reported a possible out-of-bounds problem for the cdc_parse_cdc_header function. He writes: It looks like cdc_parse_cdc_header() doesn't validate buflen before accessing buffer[1], buffer[2] and so on. The only check present is while (buflen > 0). So fix this issue up by properly validating the buffer length matches what the descriptor says it is. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/net/usb/usbnet.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index c2ea4e5666fb..9710cf71054a 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -1990,6 +1990,10 @@ int cdc_parse_cdc_header(struct usb_cdc_parsed_header *hdr, elength = 1; goto next_desc; } + if ((buflen < elength) || (elength < 3)) { + dev_err(&intf->dev, "invalid descriptor buffer length\n"); + break; + } if (buffer[1] != USB_DT_CS_INTERFACE) { dev_err(&intf->dev, "skipping garbage\n"); goto next_desc; From ddcbaf853dc5e3242b8423cdeae4a180a34432cb Mon Sep 17 00:00:00 2001 From: Dmitry Fleytman Date: Tue, 5 Sep 2017 11:40:56 +0300 Subject: [PATCH 372/546] usb: Increase quirk delay for USB devices commit b2a542bbb3081dbd64acc8929c140d196664c406 upstream. Commit e0429362ab15 ("usb: Add device quirk for Logitech HD Pro Webcams C920 and C930e") introduced quirk to workaround an issue with some Logitech webcams. The workaround is introducing delay for some USB operations. According to our testing, delay introduced by original commit is not long enough and in rare cases we still see issues described by the aforementioned commit. This patch increases delays introduced by original commit. Having this patch applied we do not see those problems anymore. Signed-off-by: Dmitry Fleytman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 2 +- drivers/usb/core/hub.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index 325cbc9c35d8..adddfee0a2ef 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -818,7 +818,7 @@ int usb_get_configuration(struct usb_device *dev) } if (dev->quirks & USB_QUIRK_DELAY_INIT) - msleep(100); + msleep(200); result = usb_get_descriptor(dev, USB_DT_CONFIG, cfgno, bigbuffer, length); diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c index cdf4be3939f5..51bba58c0c3b 100644 --- a/drivers/usb/core/hub.c +++ b/drivers/usb/core/hub.c @@ -4761,7 +4761,7 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, goto loop; if (udev->quirks & USB_QUIRK_DELAY_INIT) - msleep(1000); + msleep(2000); /* consecutive bus-powered hubs aren't reliable; they can * violate the voltage drop budget. if the new child has From 13713e63bdb306f9a58d359b15edd60f34eac5ee Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 19 Sep 2017 15:07:17 +0200 Subject: [PATCH 373/546] USB: fix out-of-bounds in usb_set_configuration commit bd7a3fe770ebd8391d1c7d072ff88e9e76d063eb upstream. Andrey Konovalov reported a possible out-of-bounds problem for a USB interface association descriptor. He writes: It seems there's no proper size check of a USB_DT_INTERFACE_ASSOCIATION descriptor. It's only checked that the size is >= 2 in usb_parse_configuration(), so find_iad() might do out-of-bounds access to intf_assoc->bInterfaceCount. And he's right, we don't check for crazy descriptors of this type very well, so resolve this problem. Yet another issue found by syzkaller... Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/usb/core/config.c | 14 +++++++++++--- include/uapi/linux/usb/ch9.h | 1 + 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/drivers/usb/core/config.c b/drivers/usb/core/config.c index adddfee0a2ef..d9d048fc9082 100644 --- a/drivers/usb/core/config.c +++ b/drivers/usb/core/config.c @@ -609,15 +609,23 @@ static int usb_parse_configuration(struct usb_device *dev, int cfgidx, } else if (header->bDescriptorType == USB_DT_INTERFACE_ASSOCIATION) { + struct usb_interface_assoc_descriptor *d; + + d = (struct usb_interface_assoc_descriptor *)header; + if (d->bLength < USB_DT_INTERFACE_ASSOCIATION_SIZE) { + dev_warn(ddev, + "config %d has an invalid interface association descriptor of length %d, skipping\n", + cfgno, d->bLength); + continue; + } + if (iad_num == USB_MAXIADS) { dev_warn(ddev, "found more Interface " "Association Descriptors " "than allocated for in " "configuration %d\n", cfgno); } else { - config->intf_assoc[iad_num] = - (struct usb_interface_assoc_descriptor - *)header; + config->intf_assoc[iad_num] = d; iad_num++; } diff --git a/include/uapi/linux/usb/ch9.h b/include/uapi/linux/usb/ch9.h index 779a62aafafe..91ab75c1013c 100644 --- a/include/uapi/linux/usb/ch9.h +++ b/include/uapi/linux/usb/ch9.h @@ -717,6 +717,7 @@ struct usb_interface_assoc_descriptor { __u8 iFunction; } __attribute__ ((packed)); +#define USB_DT_INTERFACE_ASSOCIATION_SIZE 8 /*-------------------------------------------------------------------------*/ From 4590ed795f0ccb1401d515eaa5864166032a4cfb Mon Sep 17 00:00:00 2001 From: Mathias Nyman Date: Mon, 18 Sep 2017 17:39:13 +0300 Subject: [PATCH 374/546] xhci: fix finding correct bus_state structure for USB 3.1 hosts commit 5a838a13c9b4e5dd188b7a6eaeb894e9358ead0c upstream. xhci driver keeps a bus_state structure for each hcd (usb2 and usb3) The structure is picked based on hcd speed, but driver only compared for HCD_USB3 speed, returning the wrong bus_state for HCD_USB31 hosts. This caused null pointer dereference errors in bus_resume function. Signed-off-by: Mathias Nyman Signed-off-by: Greg Kroah-Hartman --- drivers/usb/host/xhci.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/host/xhci.h b/drivers/usb/host/xhci.h index fc2ee6c272c4..1715705acc59 100644 --- a/drivers/usb/host/xhci.h +++ b/drivers/usb/host/xhci.h @@ -1490,7 +1490,7 @@ struct xhci_bus_state { static inline unsigned int hcd_index(struct usb_hcd *hcd) { - if (hcd->speed == HCD_USB3) + if (hcd->speed >= HCD_USB3) return 0; else return 1; From 0141f858d2e137a7de0bbb0fb4a9cfa3108774e8 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:18 +0200 Subject: [PATCH 375/546] iio: adc: twl4030: Fix an error handling path in 'twl4030_madc_probe()' commit 245a396a9b1a67ac5c3228737c261b3e48708a2a upstream. If 'devm_regulator_get()' fails, we should go through the existing error handling path instead of returning directly, as done is all the other error handling paths in this function. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/twl4030-madc.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 0c74869a540a..79028c95b673 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -866,8 +866,10 @@ static int twl4030_madc_probe(struct platform_device *pdev) /* Enable 3v1 bias regulator for MADC[3:6] */ madc->usb3v1 = devm_regulator_get(madc->dev, "vusb3v1"); - if (IS_ERR(madc->usb3v1)) - return -ENODEV; + if (IS_ERR(madc->usb3v1)) { + ret = -ENODEV; + goto err_i2c; + } ret = regulator_enable(madc->usb3v1); if (ret) From 0bab54141bac4025534795eb83c70a69ce6e91b3 Mon Sep 17 00:00:00 2001 From: Christophe JAILLET Date: Sat, 23 Sep 2017 08:06:19 +0200 Subject: [PATCH 376/546] iio: adc: twl4030: Disable the vusb3v1 rugulator in the error handling path of 'twl4030_madc_probe()' commit 7f70be6e4025db0551e6863e7eb9cca07122695c upstream. Commit 7cc97d77ee8a has introduced a call to 'regulator_disable()' in the .remove function. So we should also have such a call in the .probe function in case of error after a successful 'regulator_enable()' call. Add a new label for that and use it. Fixes: 7cc97d77ee8a ("iio: adc: twl4030: Fix ADC[3:6] readings") Signed-off-by: Christophe JAILLET Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/twl4030-madc.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/adc/twl4030-madc.c b/drivers/iio/adc/twl4030-madc.c index 79028c95b673..7ffc5db4d7ee 100644 --- a/drivers/iio/adc/twl4030-madc.c +++ b/drivers/iio/adc/twl4030-madc.c @@ -878,11 +878,13 @@ static int twl4030_madc_probe(struct platform_device *pdev) ret = iio_device_register(iio_dev); if (ret) { dev_err(&pdev->dev, "could not register iio device\n"); - goto err_i2c; + goto err_usb3v1; } return 0; +err_usb3v1: + regulator_disable(madc->usb3v1); err_i2c: twl4030_madc_set_current_generator(madc, 0, 0); err_current_generator: From 4b9c62a00aeae875cecbc9ac67753534e2681e4b Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:14:45 +0300 Subject: [PATCH 377/546] iio: ad_sigma_delta: Implement a dedicated reset function commit 7fc10de8d49a748c476532c9d8e8fe19e548dd67 upstream. Since most of the SD ADCs have the option of reseting the serial interface by sending a number of SCLKs with CS = 0 and DIN = 1, a dedicated function that can do this is usefull. Needed for the patch: iio: ad7793: Fix the serial interface reset Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/ad_sigma_delta.c | 28 ++++++++++++++++++++++++++ include/linux/iio/adc/ad_sigma_delta.h | 3 +++ 2 files changed, 31 insertions(+) diff --git a/drivers/iio/adc/ad_sigma_delta.c b/drivers/iio/adc/ad_sigma_delta.c index d10bd0c97233..22c4c17cd996 100644 --- a/drivers/iio/adc/ad_sigma_delta.c +++ b/drivers/iio/adc/ad_sigma_delta.c @@ -177,6 +177,34 @@ int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, } EXPORT_SYMBOL_GPL(ad_sd_read_reg); +/** + * ad_sd_reset() - Reset the serial interface + * + * @sigma_delta: The sigma delta device + * @reset_length: Number of SCLKs with DIN = 1 + * + * Returns 0 on success, an error code otherwise. + **/ +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length) +{ + uint8_t *buf; + unsigned int size; + int ret; + + size = DIV_ROUND_UP(reset_length, 8); + buf = kcalloc(size, sizeof(*buf), GFP_KERNEL); + if (!buf) + return -ENOMEM; + + memset(buf, 0xff, size); + ret = spi_write(sigma_delta->spi, buf, size); + kfree(buf); + + return ret; +} +EXPORT_SYMBOL_GPL(ad_sd_reset); + static int ad_sd_calibrate(struct ad_sigma_delta *sigma_delta, unsigned int mode, unsigned int channel) { diff --git a/include/linux/iio/adc/ad_sigma_delta.h b/include/linux/iio/adc/ad_sigma_delta.h index e7fdec4db9da..6cc48ac55fd2 100644 --- a/include/linux/iio/adc/ad_sigma_delta.h +++ b/include/linux/iio/adc/ad_sigma_delta.h @@ -111,6 +111,9 @@ int ad_sd_write_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, int ad_sd_read_reg(struct ad_sigma_delta *sigma_delta, unsigned int reg, unsigned int size, unsigned int *val); +int ad_sd_reset(struct ad_sigma_delta *sigma_delta, + unsigned int reset_length); + int ad_sigma_delta_single_conversion(struct iio_dev *indio_dev, const struct iio_chan_spec *chan, int *val); int ad_sd_calibrate_all(struct ad_sigma_delta *sigma_delta, From b86df98578ab321655a74e00f755e3636dcf5747 Mon Sep 17 00:00:00 2001 From: Stefan Popa Date: Thu, 14 Sep 2017 16:50:28 +0300 Subject: [PATCH 378/546] staging: iio: ad7192: Fix - use the dedicated reset function avoiding dma from stack. commit f790923f146140a261ad211e5baf75d169f16fb2 upstream. Depends on: 691c4b95d1 ("iio: ad_sigma_delta: Implement a dedicated reset function") SPI host drivers can use DMA to transfer data, so the buffer should be properly allocated. Keeping it on the stack could cause an undefined behavior. The dedicated reset function solves this issue. Signed-off-by: Stefan Popa Acked-by: Lars-Peter Clausen Acked-by: Michael Hennerich Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/staging/iio/adc/ad7192.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/staging/iio/adc/ad7192.c b/drivers/staging/iio/adc/ad7192.c index 20314ff08be0..abc66908681d 100644 --- a/drivers/staging/iio/adc/ad7192.c +++ b/drivers/staging/iio/adc/ad7192.c @@ -205,11 +205,9 @@ static int ad7192_setup(struct ad7192_state *st, struct iio_dev *indio_dev = spi_get_drvdata(st->sd.spi); unsigned long long scale_uv; int i, ret, id; - u8 ones[6]; /* reset the serial interface */ - memset(&ones, 0xFF, 6); - ret = spi_write(st->sd.spi, &ones, 6); + ret = ad_sd_reset(&st->sd, 48); if (ret < 0) goto out; usleep_range(500, 1000); /* Wait for at least 500us */ From 2c29a386809087a98e97c9775f1febdc9de6ab02 Mon Sep 17 00:00:00 2001 From: Matt Fornero Date: Tue, 5 Sep 2017 16:34:10 +0200 Subject: [PATCH 379/546] iio: core: Return error for failed read_reg commit 3d62c78a6eb9a7d67bace9622b66ad51e81c5f9b upstream. If an IIO device returns an error code for a read access via debugfs, it is currently ignored by the IIO core (other than emitting an error message). Instead, return this error code to user space, so upper layers can detect it correctly. Signed-off-by: Matt Fornero Signed-off-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/industrialio-core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/drivers/iio/industrialio-core.c b/drivers/iio/industrialio-core.c index 131b434af994..e08a3c794120 100644 --- a/drivers/iio/industrialio-core.c +++ b/drivers/iio/industrialio-core.c @@ -221,8 +221,10 @@ static ssize_t iio_debugfs_read_reg(struct file *file, char __user *userbuf, ret = indio_dev->info->debugfs_reg_access(indio_dev, indio_dev->cached_reg_addr, 0, &val); - if (ret) + if (ret) { dev_err(indio_dev->dev.parent, "%s: read failed\n", __func__); + return ret; + } len = snprintf(buf, sizeof(buf), "0x%X\n", val); From f2f68ec0b2847b38d5d0dcb64470a45d9c96edf7 Mon Sep 17 00:00:00 2001 From: Dragos Bogdan Date: Tue, 5 Sep 2017 15:16:13 +0300 Subject: [PATCH 380/546] iio: ad7793: Fix the serial interface reset commit 7ee3b7ebcb74714df6d94c8f500f307e1ee5dda5 upstream. The serial interface can be reset by writing 32 consecutive 1s to the device. 'ret' was initialized correctly but its value was overwritten when ad7793_check_platform_data() was called. Since a dedicated reset function is present now, it should be used instead. Fixes: 2edb769d246e ("iio:ad7793: Add support for the ad7798 and ad7799") Signed-off-by: Dragos Bogdan Acked-by: Lars-Peter Clausen Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/ad7793.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/iio/adc/ad7793.c b/drivers/iio/adc/ad7793.c index 4d960d3b93c0..91d34ed756ea 100644 --- a/drivers/iio/adc/ad7793.c +++ b/drivers/iio/adc/ad7793.c @@ -257,7 +257,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, unsigned int vref_mv) { struct ad7793_state *st = iio_priv(indio_dev); - int i, ret = -1; + int i, ret; unsigned long long scale_uv; u32 id; @@ -266,7 +266,7 @@ static int ad7793_setup(struct iio_dev *indio_dev, return ret; /* reset the serial interface */ - ret = spi_write(st->sd.spi, (u8 *)&ret, sizeof(ret)); + ret = ad_sd_reset(&st->sd, 32); if (ret < 0) goto out; usleep_range(500, 2000); /* Wait for at least 500us */ From 18215da0c24117da53b164467b89f5dc350b4d0b Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: [PATCH 381/546] iio: adc: mcp320x: Fix readout of negative voltages commit e6f4794371ee7cce1339e7ca9542f1e703c5f84a upstream. Commit f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") returns a signed voltage from mcp320x_adc_conversion() but neglects that the caller interprets a negative return value as failure. Only mcp3301 (and the upcoming mcp3550/1/3) is affected as the other chips are incapable of measuring negative voltages. Fix and while at it, add mcp3301 to the list of supported chips at the top of the file. Fixes: f686a36b4b79 ("iio: adc: mcp320x: Add support for mcp3301") Cc: Andrea Galbusera Signed-off-by: Lukas Wunner Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/mcp320x.c | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index 8569c8e1f4b2..5c51f6f5cbff 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -17,6 +17,8 @@ * MCP3204 * MCP3208 * ------------ + * 13 bit converter + * MCP3301 * * Datasheet can be found here: * http://ww1.microchip.com/downloads/en/DeviceDoc/21293C.pdf mcp3001 @@ -96,7 +98,7 @@ static int mcp320x_channel_to_tx_data(int device_index, } static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, - bool differential, int device_index) + bool differential, int device_index, int *val) { int ret; @@ -117,19 +119,25 @@ static int mcp320x_adc_conversion(struct mcp320x *adc, u8 channel, switch (device_index) { case mcp3001: - return (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + *val = (adc->rx_buf[0] << 5 | adc->rx_buf[1] >> 3); + return 0; case mcp3002: case mcp3004: case mcp3008: - return (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + *val = (adc->rx_buf[0] << 2 | adc->rx_buf[1] >> 6); + return 0; case mcp3201: - return (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + *val = (adc->rx_buf[0] << 7 | adc->rx_buf[1] >> 1); + return 0; case mcp3202: case mcp3204: case mcp3208: - return (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + *val = (adc->rx_buf[0] << 4 | adc->rx_buf[1] >> 4); + return 0; case mcp3301: - return sign_extend32((adc->rx_buf[0] & 0x1f) << 8 | adc->rx_buf[1], 12); + *val = sign_extend32((adc->rx_buf[0] & 0x1f) << 8 + | adc->rx_buf[1], 12); + return 0; default: return -EINVAL; } @@ -150,12 +158,10 @@ static int mcp320x_read_raw(struct iio_dev *indio_dev, switch (mask) { case IIO_CHAN_INFO_RAW: ret = mcp320x_adc_conversion(adc, channel->address, - channel->differential, device_index); - + channel->differential, device_index, val); if (ret < 0) goto out; - *val = ret; ret = IIO_VAL_INT; break; From 8b4196420dd6a60c75b5d1d346a91d87cc013a0f Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Tue, 22 Aug 2017 15:33:00 +0200 Subject: [PATCH 382/546] iio: adc: mcp320x: Fix oops on module unload commit 0964e40947a630a2a6f724e968246992f97bcf1c upstream. The driver calls spi_get_drvdata() in its ->remove hook even though it has never called spi_set_drvdata(). Stack trace for posterity: Unable to handle kernel NULL pointer dereference at virtual address 00000220 Internal error: Oops: 5 [#1] SMP ARM [<8072f564>] (mutex_lock) from [<7f1400d0>] (iio_device_unregister+0x24/0x7c [industrialio]) [<7f1400d0>] (iio_device_unregister [industrialio]) from [<7f15e020>] (mcp320x_remove+0x20/0x30 [mcp320x]) [<7f15e020>] (mcp320x_remove [mcp320x]) from [<8055a8cc>] (spi_drv_remove+0x2c/0x44) [<8055a8cc>] (spi_drv_remove) from [<805087bc>] (__device_release_driver+0x98/0x134) [<805087bc>] (__device_release_driver) from [<80509180>] (driver_detach+0xdc/0xe0) [<80509180>] (driver_detach) from [<8050823c>] (bus_remove_driver+0x5c/0xb0) [<8050823c>] (bus_remove_driver) from [<80509ab0>] (driver_unregister+0x38/0x58) [<80509ab0>] (driver_unregister) from [<7f15e69c>] (mcp320x_driver_exit+0x14/0x1c [mcp320x]) [<7f15e69c>] (mcp320x_driver_exit [mcp320x]) from [<801a78d0>] (SyS_delete_module+0x184/0x1d0) [<801a78d0>] (SyS_delete_module) from [<80108100>] (ret_fast_syscall+0x0/0x1c) Fixes: f5ce4a7a9291 ("iio: adc: add driver for MCP3204/08 12-bit ADC") Cc: Oskar Andero Signed-off-by: Lukas Wunner Signed-off-by: Jonathan Cameron Signed-off-by: Greg Kroah-Hartman --- drivers/iio/adc/mcp320x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iio/adc/mcp320x.c b/drivers/iio/adc/mcp320x.c index 5c51f6f5cbff..ad2681acce9a 100644 --- a/drivers/iio/adc/mcp320x.c +++ b/drivers/iio/adc/mcp320x.c @@ -310,6 +310,7 @@ static int mcp320x_probe(struct spi_device *spi) indio_dev->name = spi_get_device_id(spi)->name; indio_dev->modes = INDIO_DIRECT_MODE; indio_dev->info = &mcp320x_info; + spi_set_drvdata(spi, indio_dev); chip_info = &mcp320x_chip_infos[spi_get_device_id(spi)->driver_data]; indio_dev->channels = chip_info->channels; From 5a21af11c6810b936b17a2c5c69518be0da8f4c3 Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 14:30:55 +0200 Subject: [PATCH 383/546] uwb: properly check kthread_run return value commit bbf26183b7a6236ba602f4d6a2f7cade35bba043 upstream. uwbd_start() calls kthread_run() and checks that the return value is not NULL. But the return value is not NULL in case kthread_run() fails, it takes the form of ERR_PTR(-EINTR). Use IS_ERR() instead. Also add a check to uwbd_stop(). Signed-off-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/uwbd.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/uwb/uwbd.c b/drivers/uwb/uwbd.c index bdcb13cc1d54..5c9828370217 100644 --- a/drivers/uwb/uwbd.c +++ b/drivers/uwb/uwbd.c @@ -303,18 +303,22 @@ static int uwbd(void *param) /** Start the UWB daemon */ void uwbd_start(struct uwb_rc *rc) { - rc->uwbd.task = kthread_run(uwbd, rc, "uwbd"); - if (rc->uwbd.task == NULL) + struct task_struct *task = kthread_run(uwbd, rc, "uwbd"); + if (IS_ERR(task)) { + rc->uwbd.task = NULL; printk(KERN_ERR "UWB: Cannot start management daemon; " "UWB won't work\n"); - else + } else { + rc->uwbd.task = task; rc->uwbd.pid = rc->uwbd.task->pid; + } } /* Stop the UWB daemon and free any unprocessed events */ void uwbd_stop(struct uwb_rc *rc) { - kthread_stop(rc->uwbd.task); + if (rc->uwbd.task) + kthread_stop(rc->uwbd.task); uwbd_flush(rc); } From ee5bd0e4e69fcf8e4194d9ad7de2241346c9bbbf Mon Sep 17 00:00:00 2001 From: Andrey Konovalov Date: Thu, 14 Sep 2017 16:52:59 +0200 Subject: [PATCH 384/546] uwb: ensure that endpoint is interrupt commit 70e743e4cec3733dc13559f6184b35d358b9ef3f upstream. hwarc_neep_init() assumes that endpoint 0 is interrupt, but there's no check for that, which results in a WARNING in USB core code, when a bad USB descriptor is provided from a device: usb 1-1: BOGUS urb xfer, pipe 1 != type 3 ------------[ cut here ]------------ WARNING: CPU: 0 PID: 3 at drivers/usb/core/urb.c:449 usb_submit_urb+0xf8a/0x11d0 Modules linked in: CPU: 0 PID: 3 Comm: kworker/0:0 Not tainted 4.13.0+ #111 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event task: ffff88006bdc1a00 task.stack: ffff88006bde8000 RIP: 0010:usb_submit_urb+0xf8a/0x11d0 drivers/usb/core/urb.c:448 RSP: 0018:ffff88006bdee3c0 EFLAGS: 00010282 RAX: 0000000000000029 RBX: ffff8800672a7200 RCX: 0000000000000000 RDX: 0000000000000029 RSI: ffff88006c815c78 RDI: ffffed000d7bdc6a RBP: ffff88006bdee4c0 R08: fffffbfff0fe00ff R09: fffffbfff0fe00ff R10: 0000000000000018 R11: fffffbfff0fe00fe R12: 1ffff1000d7bdc7f R13: 0000000000000003 R14: 0000000000000001 R15: ffff88006b02cc90 FS: 0000000000000000(0000) GS:ffff88006c800000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007fe4daddf000 CR3: 000000006add6000 CR4: 00000000000006f0 Call Trace: hwarc_neep_init+0x4ce/0x9c0 drivers/uwb/hwa-rc.c:710 uwb_rc_add+0x2fb/0x730 drivers/uwb/lc-rc.c:361 hwarc_probe+0x34e/0x9b0 drivers/uwb/hwa-rc.c:858 usb_probe_interface+0x351/0x8d0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_set_configuration+0x1064/0x1890 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:385 driver_probe_device+0x610/0xa00 drivers/base/dd.c:529 __device_attach_driver+0x230/0x290 drivers/base/dd.c:625 bus_for_each_drv+0x15e/0x210 drivers/base/bus.c:463 __device_attach+0x269/0x3c0 drivers/base/dd.c:682 device_initial_probe+0x1f/0x30 drivers/base/dd.c:729 bus_probe_device+0x1da/0x280 drivers/base/bus.c:523 device_add+0xcf9/0x1640 drivers/base/core.c:1703 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4890 hub_port_connect_change drivers/usb/core/hub.c:4996 port_event drivers/usb/core/hub.c:5102 hub_event+0x23c8/0x37c0 drivers/usb/core/hub.c:5182 process_one_work+0x9fb/0x1570 kernel/workqueue.c:2097 worker_thread+0x1e4/0x1350 kernel/workqueue.c:2231 kthread+0x324/0x3f0 kernel/kthread.c:231 ret_from_fork+0x25/0x30 arch/x86/entry/entry_64.S:425 Code: 48 8b 85 30 ff ff ff 48 8d b8 98 00 00 00 e8 8e 93 07 ff 45 89 e8 44 89 f1 4c 89 fa 48 89 c6 48 c7 c7 a0 e5 55 86 e8 20 08 8f fd <0f> ff e9 9b f7 ff ff e8 4a 04 d6 fd e9 80 f7 ff ff e8 60 11 a6 ---[ end trace 55d741234124cfc3 ]--- Check that endpoint is interrupt. Found by syzkaller. Signed-off-by: Andrey Konovalov Signed-off-by: Greg Kroah-Hartman --- drivers/uwb/hwa-rc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/uwb/hwa-rc.c b/drivers/uwb/hwa-rc.c index e75bbe5a10cd..1212b4b3c5a9 100644 --- a/drivers/uwb/hwa-rc.c +++ b/drivers/uwb/hwa-rc.c @@ -827,6 +827,8 @@ static int hwarc_probe(struct usb_interface *iface, if (iface->cur_altsetting->desc.bNumEndpoints < 1) return -ENODEV; + if (!usb_endpoint_xfer_int(&iface->cur_altsetting->endpoint[0].desc)) + return -ENODEV; result = -ENOMEM; uwb_rc = uwb_rc_alloc(); From b8af4466255c0898c6ba14b729926d610a89d86a Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:54 +0200 Subject: [PATCH 385/546] brcmfmac: setup passive scan if requested by user-space commit 35f62727df0ed8e5e4857e162d94fd46d861f1cf upstream. The driver was not properly configuring firmware with regard to the type of scan. It always performed an active scan even when user-space was requesting for passive scan, ie. the scan request was done without any SSIDs specified. Reported-by: Huang, Jiangyang Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- .../wireless/brcm80211/brcmfmac/cfg80211.c | 19 ++++--------------- .../wireless/brcm80211/brcmfmac/fwil_types.h | 5 +++++ 2 files changed, 9 insertions(+), 15 deletions(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c index da5826d788d6..f18491cf793c 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c @@ -876,7 +876,7 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, eth_broadcast_addr(params_le->bssid); params_le->bss_type = DOT11_BSSTYPE_ANY; - params_le->scan_type = 0; + params_le->scan_type = BRCMF_SCANTYPE_ACTIVE; params_le->channel_num = 0; params_le->nprobes = cpu_to_le32(-1); params_le->active_time = cpu_to_le32(-1); @@ -884,12 +884,9 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, params_le->home_time = cpu_to_le32(-1); memset(¶ms_le->ssid_le, 0, sizeof(params_le->ssid_le)); - /* if request is null exit so it will be all channel broadcast scan */ - if (!request) - return; - n_ssids = request->n_ssids; n_channels = request->n_channels; + /* Copy channel array if applicable */ brcmf_dbg(SCAN, "### List of channelspecs to scan ### %d\n", n_channels); @@ -926,16 +923,8 @@ static void brcmf_escan_prep(struct brcmf_cfg80211_info *cfg, ptr += sizeof(ssid_le); } } else { - brcmf_dbg(SCAN, "Broadcast scan %p\n", request->ssids); - if ((request->ssids) && request->ssids->ssid_len) { - brcmf_dbg(SCAN, "SSID %s len=%d\n", - params_le->ssid_le.SSID, - request->ssids->ssid_len); - params_le->ssid_le.SSID_len = - cpu_to_le32(request->ssids->ssid_len); - memcpy(¶ms_le->ssid_le.SSID, request->ssids->ssid, - request->ssids->ssid_len); - } + brcmf_dbg(SCAN, "Performing passive scan\n"); + params_le->scan_type = BRCMF_SCANTYPE_PASSIVE; } /* Adding mask to channel numbers */ params_le->channel_num = diff --git a/drivers/net/wireless/brcm80211/brcmfmac/fwil_types.h b/drivers/net/wireless/brcm80211/brcmfmac/fwil_types.h index daa427b46712..4320c4cae53e 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/fwil_types.h +++ b/drivers/net/wireless/brcm80211/brcmfmac/fwil_types.h @@ -45,6 +45,11 @@ #define BRCMF_SCAN_PARAMS_COUNT_MASK 0x0000ffff #define BRCMF_SCAN_PARAMS_NSSID_SHIFT 16 +/* scan type definitions */ +#define BRCMF_SCANTYPE_DEFAULT 0xFF +#define BRCMF_SCANTYPE_ACTIVE 0 +#define BRCMF_SCANTYPE_PASSIVE 1 + /* primary (ie tx) key */ #define BRCMF_PRIMARY_KEY (1 << 1) #define DOT11_BSSTYPE_ANY 2 From 33d1fa43aad4ca11f5d01ede363c1dbdd2010540 Mon Sep 17 00:00:00 2001 From: Jani Nikula Date: Thu, 21 Sep 2017 17:19:20 +0300 Subject: [PATCH 386/546] drm/i915/bios: ignore HDMI on port A MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 2ba7d7e0437127314864238f8bfcb8369d81075c upstream. The hardware state readout oopses after several warnings when trying to use HDMI on port A, if such a combination is configured in VBT. Filter the combo out already at the VBT parsing phase. v2: also ignore DVI (Ville) Bugzilla: https://bugs.freedesktop.org/show_bug.cgi?id=102889 Cc: Imre Deak Reviewed-by: Ville Syrjälä Tested-by: Daniel Drake Signed-off-by: Jani Nikula Link: https://patchwork.freedesktop.org/patch/msgid/20170921141920.18172-1-jani.nikula@intel.com (cherry picked from commit d27ffc1d00327c29b3aa97f941b42f0949f9e99f) Signed-off-by: Rodrigo Vivi Signed-off-by: Greg Kroah-Hartman --- drivers/gpu/drm/i915/intel_bios.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/drivers/gpu/drm/i915/intel_bios.c b/drivers/gpu/drm/i915/intel_bios.c index d14bdc537587..0a2ac3efd04e 100644 --- a/drivers/gpu/drm/i915/intel_bios.c +++ b/drivers/gpu/drm/i915/intel_bios.c @@ -957,6 +957,13 @@ static void parse_ddi_port(struct drm_i915_private *dev_priv, enum port port, is_hdmi = is_dvi && (child->common.device_type & DEVICE_TYPE_NOT_HDMI_OUTPUT) == 0; is_edp = is_dp && (child->common.device_type & DEVICE_TYPE_INTERNAL_CONNECTOR); + if (port == PORT_A && is_dvi) { + DRM_DEBUG_KMS("VBT claims port A supports DVI%s, ignoring\n", + is_hdmi ? "/HDMI" : ""); + is_dvi = false; + is_hdmi = false; + } + info->supports_dvi = is_dvi; info->supports_hdmi = is_hdmi; info->supports_dp = is_dp; From 6d1400b09f99cfee275f9f0fd5e58d4fb1a1f1f0 Mon Sep 17 00:00:00 2001 From: Keith Busch Date: Thu, 26 Nov 2015 12:21:29 +0100 Subject: [PATCH 387/546] nvme: protect against simultaneous shutdown invocations commit 77bf25ea70200cddf083f74b7f617e5f07fac8bd upstream. [Back-ported to 4.4. The difference is the file location of the struct definition that's adding the mutex. This fixes reported kernel panics in 4.4-stable from simultaneous controller resets that was never supposed to be allowed to happen.] Signed-off-by: Keith Busch [hch: split from a larger patch] Signed-off-by: Christoph Hellwig Signed-off-by: Jens Axboe Signed-off-by: Greg Kroah-Hartman --- drivers/nvme/host/nvme.h | 2 ++ drivers/nvme/host/pci.c | 3 +++ 2 files changed, 5 insertions(+) diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index b8a5a8e8f57d..88cf4f5025b0 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -14,6 +14,7 @@ #ifndef _NVME_H #define _NVME_H +#include #include #include #include @@ -62,6 +63,7 @@ struct nvme_dev { struct work_struct reset_work; struct work_struct probe_work; struct work_struct scan_work; + struct mutex shutdown_lock; char name[12]; char serial[20]; char model[40]; diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index 4c673d45f1bd..669edbd47602 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -2954,6 +2954,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) nvme_dev_list_remove(dev); + mutex_lock(&dev->shutdown_lock); if (pci_is_enabled(to_pci_dev(dev->dev))) { nvme_freeze_queues(dev); csts = readl(&dev->bar->csts); @@ -2972,6 +2973,7 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); + mutex_unlock(&dev->shutdown_lock); } static void nvme_dev_remove(struct nvme_dev *dev) @@ -3328,6 +3330,7 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) INIT_LIST_HEAD(&dev->namespaces); INIT_WORK(&dev->reset_work, nvme_reset_work); + mutex_init(&dev->shutdown_lock); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); From 90fd6738731b6d105fc8f04832ae17a9ac82c05c Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Thu, 7 Sep 2017 11:13:38 +0200 Subject: [PATCH 388/546] sched/cpuset/pm: Fix cpuset vs. suspend-resume bugs commit 50e76632339d4655859523a39249dd95ee5e93e7 upstream. Cpusets vs. suspend-resume is _completely_ broken. And it got noticed because it now resulted in non-cpuset usage breaking too. On suspend cpuset_cpu_inactive() doesn't call into cpuset_update_active_cpus() because it doesn't want to move tasks about, there is no need, all tasks are frozen and won't run again until after we've resumed everything. But this means that when we finally do call into cpuset_update_active_cpus() after resuming the last frozen cpu in cpuset_cpu_active(), the top_cpuset will not have any difference with the cpu_active_mask and this it will not in fact do _anything_. So the cpuset configuration will not be restored. This was largely hidden because we would unconditionally create identity domains and mobile users would not in fact use cpusets much. And servers what do use cpusets tend to not suspend-resume much. An addition problem is that we'd not in fact wait for the cpuset work to finish before resuming the tasks, allowing spurious migrations outside of the specified domains. Fix the rebuild by introducing cpuset_force_rebuild() and fix the ordering with cpuset_wait_for_hotplug(). Reported-by: Andy Lutomirski Signed-off-by: Peter Zijlstra (Intel) Cc: Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Mike Galbraith Cc: Peter Zijlstra Cc: Rafael J. Wysocki Cc: Tejun Heo Cc: Thomas Gleixner Fixes: deb7aa308ea2 ("cpuset: reorganize CPU / memory hotplug handling") Link: http://lkml.kernel.org/r/20170907091338.orwxrqkbfkki3c24@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar Signed-off-by: Mike Galbraith Signed-off-by: Greg Kroah-Hartman --- include/linux/cpuset.h | 6 ++++++ kernel/cpuset.c | 16 +++++++++++++++- kernel/power/process.c | 5 ++++- kernel/sched/core.c | 7 +++---- 4 files changed, 28 insertions(+), 6 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 8397dc235e84..ad98acfbcba8 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -43,7 +43,9 @@ static inline void cpuset_dec(void) extern int cpuset_init(void); extern void cpuset_init_smp(void); +extern void cpuset_force_rebuild(void); extern void cpuset_update_active_cpus(bool cpu_online); +extern void cpuset_wait_for_hotplug(void); extern void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask); extern void cpuset_cpus_allowed_fallback(struct task_struct *p); extern nodemask_t cpuset_mems_allowed(struct task_struct *p); @@ -147,11 +149,15 @@ static inline bool cpusets_enabled(void) { return false; } static inline int cpuset_init(void) { return 0; } static inline void cpuset_init_smp(void) {} +static inline void cpuset_force_rebuild(void) { } + static inline void cpuset_update_active_cpus(bool cpu_online) { partition_sched_domains(1, NULL, NULL); } +static inline void cpuset_wait_for_hotplug(void) { } + static inline void cpuset_cpus_allowed(struct task_struct *p, struct cpumask *mask) { diff --git a/kernel/cpuset.c b/kernel/cpuset.c index 2924b6faa469..dd3ae6ee064d 100644 --- a/kernel/cpuset.c +++ b/kernel/cpuset.c @@ -2281,6 +2281,13 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs) mutex_unlock(&cpuset_mutex); } +static bool force_rebuild; + +void cpuset_force_rebuild(void) +{ + force_rebuild = true; +} + /** * cpuset_hotplug_workfn - handle CPU/memory hotunplug for a cpuset * @@ -2355,8 +2362,10 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } /* rebuild sched domains if cpus_allowed has changed */ - if (cpus_updated) + if (cpus_updated || force_rebuild) { + force_rebuild = false; rebuild_sched_domains(); + } } void cpuset_update_active_cpus(bool cpu_online) @@ -2375,6 +2384,11 @@ void cpuset_update_active_cpus(bool cpu_online) schedule_work(&cpuset_hotplug_work); } +void cpuset_wait_for_hotplug(void) +{ + flush_work(&cpuset_hotplug_work); +} + /* * Keep top_cpuset.mems_allowed tracking node_states[N_MEMORY]. * Call this routine anytime after node_states[N_MEMORY] changes. diff --git a/kernel/power/process.c b/kernel/power/process.c index 564f786df470..ba2029a02259 100644 --- a/kernel/power/process.c +++ b/kernel/power/process.c @@ -18,8 +18,9 @@ #include #include #include +#include -/* +/* * Timeout for stopping processes */ unsigned int __read_mostly freeze_timeout_msecs = 20 * MSEC_PER_SEC; @@ -198,6 +199,8 @@ void thaw_processes(void) __usermodehelper_set_disable_depth(UMH_FREEZING); thaw_workqueues(); + cpuset_wait_for_hotplug(); + read_lock(&tasklist_lock); for_each_process_thread(g, p) { /* No other threads should have PF_SUSPEND_TASK set */ diff --git a/kernel/sched/core.c b/kernel/sched/core.c index dece705b7f8c..b5d372083624 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7286,17 +7286,16 @@ static int cpuset_cpu_active(struct notifier_block *nfb, unsigned long action, * operation in the resume sequence, just build a single sched * domain, ignoring cpusets. */ - num_cpus_frozen--; - if (likely(num_cpus_frozen)) { - partition_sched_domains(1, NULL, NULL); + partition_sched_domains(1, NULL, NULL); + if (--num_cpus_frozen) break; - } /* * This is the last CPU online operation. So fall through and * restore the original sched domains by considering the * cpuset configurations. */ + cpuset_force_rebuild(); case CPU_ONLINE: cpuset_update_active_cpus(true); From 40c00e5fac3abbd5e6fa08e93fa99b3e632ece16 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Fri, 26 May 2017 17:45:45 -0400 Subject: [PATCH 389/546] ext4: fix data corruption for mmap writes commit a056bdaae7a181f7dcc876cfab2f94538e508709 upstream. mpage_submit_page() can race with another process growing i_size and writing data via mmap to the written-back page. As mpage_submit_page() samples i_size too early, it may happen that ext4_bio_write_page() zeroes out too large tail of the page and thus corrupts user data. Fix the problem by sampling i_size only after the page has been write-protected in page tables by clear_page_dirty_for_io() call. Reported-by: Michael Zimmer CC: stable@vger.kernel.org Fixes: cb20d5188366f04d96d2e07b1240cc92170ade40 Signed-off-by: Jan Kara Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/inode.c | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 1796d1bd9a1d..194a6baa4283 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1946,15 +1946,29 @@ static int ext4_writepage(struct page *page, static int mpage_submit_page(struct mpage_da_data *mpd, struct page *page) { int len; - loff_t size = i_size_read(mpd->inode); + loff_t size; int err; BUG_ON(page->index != mpd->first_page); - if (page->index == size >> PAGE_CACHE_SHIFT) - len = size & ~PAGE_CACHE_MASK; - else - len = PAGE_CACHE_SIZE; clear_page_dirty_for_io(page); + /* + * We have to be very careful here! Nothing protects writeback path + * against i_size changes and the page can be writeably mapped into + * page tables. So an application can be growing i_size and writing + * data through mmap while writeback runs. clear_page_dirty_for_io() + * write-protects our page in page tables and the page cannot get + * written to again until we release page lock. So only after + * clear_page_dirty_for_io() we are safe to sample i_size for + * ext4_bio_write_page() to zero-out tail of the written page. We rely + * on the barrier provided by TestClearPageDirty in + * clear_page_dirty_for_io() to make sure i_size is really sampled only + * after page tables are updated. + */ + size = i_size_read(mpd->inode); + if (page->index == size >> PAGE_SHIFT) + len = size & ~PAGE_MASK; + else + len = PAGE_SIZE; err = ext4_bio_write_page(&mpd->io_submit, page, len, mpd->wbc, false); if (!err) mpd->wbc->nr_to_write--; From 4f22f0793ccedb43a8fd47c20943a777228b2265 Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Sun, 30 Jul 2017 23:33:01 -0400 Subject: [PATCH 390/546] ext4: Don't clear SGID when inheriting ACLs commit a3bb2d5587521eea6dab2d05326abb0afb460abd upstream. When new directory 'DIR1' is created in a directory 'DIR0' with SGID bit set, DIR1 is expected to have SGID bit set (and owning group equal to the owning group of 'DIR0'). However when 'DIR0' also has some default ACLs that 'DIR1' inherits, setting these ACLs will result in SGID bit on 'DIR1' to get cleared if user is not member of the owning group. Fix the problem by moving posix_acl_update_mode() out of __ext4_set_acl() into ext4_set_acl(). That way the function will not be called when inheriting ACLs which is what we want as it prevents SGID bit clearing and the mode has been properly set by posix_acl_create() anyway. Fixes: 073931017b49d9458aa351605b43a7e34598caef Signed-off-by: Theodore Ts'o Signed-off-by: Jan Kara Reviewed-by: Andreas Gruenbacher Signed-off-by: Greg Kroah-Hartman --- fs/ext4/acl.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/fs/ext4/acl.c b/fs/ext4/acl.c index c3fe1e323951..ea2ef0eac0c4 100644 --- a/fs/ext4/acl.c +++ b/fs/ext4/acl.c @@ -195,13 +195,6 @@ __ext4_set_acl(handle_t *handle, struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = EXT4_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) - return error; - inode->i_ctime = ext4_current_time(inode); - ext4_mark_inode_dirty(handle, inode); - } break; case ACL_TYPE_DEFAULT: @@ -234,6 +227,8 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) { handle_t *handle; int error, retries = 0; + umode_t mode = inode->i_mode; + int update_mode = 0; retry: handle = ext4_journal_start(inode, EXT4_HT_XATTR, @@ -241,7 +236,20 @@ ext4_set_acl(struct inode *inode, struct posix_acl *acl, int type) if (IS_ERR(handle)) return PTR_ERR(handle); + if ((type == ACL_TYPE_ACCESS) && acl) { + error = posix_acl_update_mode(inode, &mode, &acl); + if (error) + goto out_stop; + update_mode = 1; + } + error = __ext4_set_acl(handle, inode, type, acl); + if (!error && update_mode) { + inode->i_mode = mode; + inode->i_ctime = ext4_current_time(inode); + ext4_mark_inode_dirty(handle, inode); + } +out_stop: ext4_journal_stop(handle); if (error == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; From 82854fb438caaeb84a0b2e47d0b0b8bbc4736aed Mon Sep 17 00:00:00 2001 From: Theodore Ts'o Date: Wed, 28 Dec 2016 00:22:52 -0500 Subject: [PATCH 391/546] ext4: don't allow encrypted operations without keys commit 173b8439e1ba362007315868928bf9d26e5cc5a6 upstream. While we allow deletes without the key, the following should not be permitted: # cd /vdc/encrypted-dir-without-key # ls -l total 4 -rw-r--r-- 1 root root 0 Dec 27 22:35 6,LKNRJsp209FbXoSvJWzB -rw-r--r-- 1 root root 286 Dec 27 22:35 uRJ5vJh9gE7vcomYMqTAyD # mv uRJ5vJh9gE7vcomYMqTAyD 6,LKNRJsp209FbXoSvJWzB Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 1d007e853f5c..6445d84266fa 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -3506,6 +3506,12 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if ((ext4_encrypted_inode(old_dir) && + !ext4_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !ext4_has_encryption_key(new_dir))) + return -ENOKEY; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3706,6 +3712,12 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, u8 new_file_type; int retval; + if ((ext4_encrypted_inode(old_dir) && + !ext4_has_encryption_key(old_dir)) || + (ext4_encrypted_inode(new_dir) && + !ext4_has_encryption_key(new_dir))) + return -ENOKEY; + if ((ext4_encrypted_inode(old_dir) || ext4_encrypted_inode(new_dir)) && (old_dir != new_dir) && From 69f53f5d37d53ba17ca744947226b4cdadb90c13 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 12 Oct 2017 11:27:40 +0200 Subject: [PATCH 392/546] Linux 4.4.92 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index c1db50ef7fb5..fab2d640a27e 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 91 +SUBLEVEL = 92 EXTRAVERSION = NAME = Blurry Fish Butt From 0b0225bf2f14ba8ff475157ca932f158cb7b62af Mon Sep 17 00:00:00 2001 From: Amit Pundir Date: Sat, 14 Oct 2017 18:03:00 +0530 Subject: [PATCH 393/546] LSK-ANDROID: cpufreq-dt: fix incorrect cherry-pick from android-4.4 Fix incorrect cherry-pick of Change-Id: Iec92d710c79d9c10d0bef1b617942185448fc214 ("ANDROID: cpufreq-dt: Set sane defaults for schedutil rate limits"), in lsk-v4.4-android topic branch. Fixes: lsk-v4.4-android commit d5a094a32657 ("ANDROID: cpufreq-dt: Set sane defaults for schedutil rate limits") Signed-off-by: Amit Pundir --- drivers/cpufreq/cpufreq-dt.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/cpufreq/cpufreq-dt.c b/drivers/cpufreq/cpufreq-dt.c index ebe3567c2cda..a72ae98b4838 100644 --- a/drivers/cpufreq/cpufreq-dt.c +++ b/drivers/cpufreq/cpufreq-dt.c @@ -279,6 +279,13 @@ static int cpufreq_init(struct cpufreq_policy *policy) policy->cpuinfo.transition_latency = transition_latency; + /* + * Android: set default parameters for parity between schedutil and + * schedfreq + */ + policy->up_transition_delay_us = transition_latency / NSEC_PER_USEC; + policy->down_transition_delay_us = 50000; /* 50ms */ + return 0; out_free_cpufreq_table: @@ -340,13 +347,6 @@ static void cpufreq_ready(struct cpufreq_policy *policy) } } - /* - * Android: set default parameters for parity between schedutil and - * schedfreq - */ - policy->up_transition_delay_us = transition_latency / NSEC_PER_USEC; - policy->down_transition_delay_us = 50000; /* 50ms */ - of_node_put(np); } From e0e5d1e258e788f844fda35f2cd99102489dd369 Mon Sep 17 00:00:00 2001 From: Michael Ellerman Date: Sat, 25 Jun 2016 21:53:30 +1000 Subject: [PATCH 394/546] UPSTREAM: Fix build break in fork.c when THREAD_SIZE < PAGE_SIZE Commit b235beea9e99 ("Clarify naming of thread info/stack allocators") breaks the build on some powerpc configs, where THREAD_SIZE < PAGE_SIZE: kernel/fork.c:235:2: error: implicit declaration of function 'free_thread_stack' kernel/fork.c:355:8: error: assignment from incompatible pointer type stack = alloc_thread_stack_node(tsk, node); ^ Fix it by renaming free_stack() to free_thread_stack(), and updating the return type of alloc_thread_stack_node(). Fixes: b235beea9e99 ("Clarify naming of thread info/stack allocators") Signed-off-by: Michael Ellerman Signed-off-by: Linus Torvalds Bug: 38331309 Change-Id: I5b7f920b459fb84adf5fc75f83bb488b855c4deb (cherry picked from commit 9521d39976db20f8ef9b56af66661482a17d5364) Signed-off-by: Zubin Mithra --- kernel/fork.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/fork.c b/kernel/fork.c index 5ee818516a1c..f98b3360397c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -176,13 +176,13 @@ static inline void free_thread_stack(unsigned long *stack) # else static struct kmem_cache *thread_stack_cache; -static struct thread_info *alloc_thread_stack_node(struct task_struct *tsk, +static unsigned long *alloc_thread_stack_node(struct task_struct *tsk, int node) { return kmem_cache_alloc_node(thread_stack_cache, THREADINFO_GFP, node); } -static void free_stack(unsigned long *stack) +static void free_thread_stack(unsigned long *stack) { kmem_cache_free(thread_stack_cache, stack); } From 500da77e50728550d7d41ee25095dd7711e6fb5e Mon Sep 17 00:00:00 2001 From: gaurav jindal Date: Mon, 18 Sep 2017 00:58:43 +0530 Subject: [PATCH 395/546] drivers: cpufreq_interactive: handle error for module load fail If the cpufreq_register_governor fails, resources for thread speedchange_task should be released. currently, concerned resources are released in module_exit, but if module loading fails, exit will not be called and resources will remain acquired. this may leave kernel in an unstable state. Change-Id: Ic33f058c069d30bfd114fa1c1380325c8e00b51c Signed-off-by: gaurav jindal --- drivers/cpufreq/cpufreq_interactive.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_interactive.c b/drivers/cpufreq/cpufreq_interactive.c index 52ef2ea6b65c..e0a586895136 100644 --- a/drivers/cpufreq/cpufreq_interactive.c +++ b/drivers/cpufreq/cpufreq_interactive.c @@ -1316,6 +1316,7 @@ static int __init cpufreq_interactive_init(void) unsigned int i; struct cpufreq_interactive_cpuinfo *pcpu; struct sched_param param = { .sched_priority = MAX_RT_PRIO-1 }; + int ret = 0; /* Initalize per-cpu timers */ for_each_possible_cpu(i) { @@ -1344,7 +1345,12 @@ static int __init cpufreq_interactive_init(void) /* NB: wake up so the thread does not look hung to the freezer */ wake_up_process(speedchange_task); - return cpufreq_register_governor(&cpufreq_gov_interactive); + ret = cpufreq_register_governor(&cpufreq_gov_interactive); + if (ret) { + kthread_stop(speedchange_task); + put_task_struct(speedchange_task); + } + return ret; } #ifdef CONFIG_CPU_FREQ_DEFAULT_GOV_INTERACTIVE From 74433ad1d82631441cc062597d456643f2911823 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Fri, 15 Sep 2017 16:17:26 -0700 Subject: [PATCH 396/546] ANDROID: configs: remove config fragments The kernel config fragments for Android have moved into their own repository located at https://android.googlesource.com/kernel/configs/ Bug: 63994171 Change-Id: I837bac54cb5c90e6a6eb0f6f0ad5c90588c1a46a Signed-off-by: Steve Muckle --- android/configs/README | 15 --- android/configs/android-base-arm64.cfg | 5 - android/configs/android-base.cfg | 165 ------------------------ android/configs/android-recommended.cfg | 139 -------------------- 4 files changed, 324 deletions(-) delete mode 100644 android/configs/README delete mode 100644 android/configs/android-base-arm64.cfg delete mode 100644 android/configs/android-base.cfg delete mode 100644 android/configs/android-recommended.cfg diff --git a/android/configs/README b/android/configs/README deleted file mode 100644 index 8798731f8904..000000000000 --- a/android/configs/README +++ /dev/null @@ -1,15 +0,0 @@ -The files in this directory are meant to be used as a base for an Android -kernel config. All devices should have the options in android-base.cfg enabled. -While not mandatory, the options in android-recommended.cfg enable advanced -Android features. - -Assuming you already have a minimalist defconfig for your device, a possible -way to enable these options would be: - - ARCH= scripts/kconfig/merge_config.sh /_defconfig android/configs/android-base.cfg android/configs/android-recommended.cfg - -This will generate a .config that can then be used to save a new defconfig or -compile a new kernel with Android features enabled. - -Because there is no tool to consistently generate these config fragments, -lets keep them alphabetically sorted instead of random. diff --git a/android/configs/android-base-arm64.cfg b/android/configs/android-base-arm64.cfg deleted file mode 100644 index 43f23d6b5391..000000000000 --- a/android/configs/android-base-arm64.cfg +++ /dev/null @@ -1,5 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -CONFIG_ARMV8_DEPRECATED=y -CONFIG_CP15_BARRIER_EMULATION=y -CONFIG_SETEND_EMULATION=y -CONFIG_SWP_EMULATION=y diff --git a/android/configs/android-base.cfg b/android/configs/android-base.cfg deleted file mode 100644 index 7199561cd42b..000000000000 --- a/android/configs/android-base.cfg +++ /dev/null @@ -1,165 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_DEVKMEM is not set -# CONFIG_DEVMEM is not set -# CONFIG_FHANDLE is not set -# CONFIG_INET_LRO is not set -# CONFIG_NFSD is not set -# CONFIG_NFS_FS is not set -# CONFIG_OABI_COMPAT is not set -# CONFIG_SYSVIPC is not set -# CONFIG_USELIB is not set -CONFIG_ANDROID=y -CONFIG_ANDROID_BINDER_DEVICES=binder,hwbinder,vndbinder -CONFIG_ANDROID_BINDER_IPC=y -CONFIG_ANDROID_LOW_MEMORY_KILLER=y -CONFIG_ASHMEM=y -CONFIG_AUDIT=y -CONFIG_BLK_DEV_INITRD=y -CONFIG_CGROUPS=y -CONFIG_CGROUP_CPUACCT=y -CONFIG_CGROUP_FREEZER=y -CONFIG_CGROUP_SCHED=y -CONFIG_DEFAULT_SECURITY_SELINUX=y -CONFIG_EMBEDDED=y -CONFIG_FB=y -CONFIG_HARDENED_USERCOPY=y -CONFIG_HIGH_RES_TIMERS=y -CONFIG_IKCONFIG=y -CONFIG_IKCONFIG_PROC=y -CONFIG_INET6_AH=y -CONFIG_INET6_ESP=y -CONFIG_INET6_IPCOMP=y -CONFIG_INET=y -CONFIG_INET_DIAG_DESTROY=y -CONFIG_INET_ESP=y -CONFIG_INET_XFRM_MODE_TUNNEL=y -CONFIG_IP6_NF_FILTER=y -CONFIG_IP6_NF_IPTABLES=y -CONFIG_IP6_NF_MANGLE=y -CONFIG_IP6_NF_MATCH_RPFILTER=y -CONFIG_IP6_NF_RAW=y -CONFIG_IP6_NF_TARGET_REJECT=y -CONFIG_IPV6=y -CONFIG_IPV6_MIP6=y -CONFIG_IPV6_MULTIPLE_TABLES=y -CONFIG_IPV6_OPTIMISTIC_DAD=y -CONFIG_IPV6_ROUTER_PREF=y -CONFIG_IPV6_ROUTE_INFO=y -CONFIG_IP_ADVANCED_ROUTER=y -CONFIG_IP_MULTICAST=y -CONFIG_IP_MULTIPLE_TABLES=y -CONFIG_IP_NF_ARPFILTER=y -CONFIG_IP_NF_ARPTABLES=y -CONFIG_IP_NF_ARP_MANGLE=y -CONFIG_IP_NF_FILTER=y -CONFIG_IP_NF_IPTABLES=y -CONFIG_IP_NF_MANGLE=y -CONFIG_IP_NF_MATCH_AH=y -CONFIG_IP_NF_MATCH_ECN=y -CONFIG_IP_NF_MATCH_TTL=y -CONFIG_IP_NF_NAT=y -CONFIG_IP_NF_RAW=y -CONFIG_IP_NF_SECURITY=y -CONFIG_IP_NF_TARGET_MASQUERADE=y -CONFIG_IP_NF_TARGET_NETMAP=y -CONFIG_IP_NF_TARGET_REDIRECT=y -CONFIG_IP_NF_TARGET_REJECT=y -CONFIG_MODULES=y -CONFIG_MODULE_UNLOAD=y -CONFIG_MODVERSIONS=y -CONFIG_NET=y -CONFIG_NETDEVICES=y -CONFIG_NETFILTER=y -CONFIG_NETFILTER_XT_MATCH_COMMENT=y -CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=y -CONFIG_NETFILTER_XT_MATCH_CONNMARK=y -CONFIG_NETFILTER_XT_MATCH_CONNTRACK=y -CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=y -CONFIG_NETFILTER_XT_MATCH_HELPER=y -CONFIG_NETFILTER_XT_MATCH_IPRANGE=y -CONFIG_NETFILTER_XT_MATCH_LENGTH=y -CONFIG_NETFILTER_XT_MATCH_LIMIT=y -CONFIG_NETFILTER_XT_MATCH_MAC=y -CONFIG_NETFILTER_XT_MATCH_MARK=y -CONFIG_NETFILTER_XT_MATCH_PKTTYPE=y -CONFIG_NETFILTER_XT_MATCH_POLICY=y -CONFIG_NETFILTER_XT_MATCH_QTAGUID=y -CONFIG_NETFILTER_XT_MATCH_QUOTA2=y -CONFIG_NETFILTER_XT_MATCH_QUOTA=y -CONFIG_NETFILTER_XT_MATCH_SOCKET=y -CONFIG_NETFILTER_XT_MATCH_STATE=y -CONFIG_NETFILTER_XT_MATCH_STATISTIC=y -CONFIG_NETFILTER_XT_MATCH_STRING=y -CONFIG_NETFILTER_XT_MATCH_TIME=y -CONFIG_NETFILTER_XT_MATCH_U32=y -CONFIG_NETFILTER_XT_TARGET_CLASSIFY=y -CONFIG_NETFILTER_XT_TARGET_CONNMARK=y -CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=y -CONFIG_NETFILTER_XT_TARGET_IDLETIMER=y -CONFIG_NETFILTER_XT_TARGET_MARK=y -CONFIG_NETFILTER_XT_TARGET_NFLOG=y -CONFIG_NETFILTER_XT_TARGET_NFQUEUE=y -CONFIG_NETFILTER_XT_TARGET_SECMARK=y -CONFIG_NETFILTER_XT_TARGET_TCPMSS=y -CONFIG_NETFILTER_XT_TARGET_TPROXY=y -CONFIG_NETFILTER_XT_TARGET_TRACE=y -CONFIG_NET_CLS_ACT=y -CONFIG_NET_CLS_U32=y -CONFIG_NET_EMATCH=y -CONFIG_NET_EMATCH_U32=y -CONFIG_NET_KEY=y -CONFIG_NET_SCHED=y -CONFIG_NET_SCH_HTB=y -CONFIG_NF_CONNTRACK=y -CONFIG_NF_CONNTRACK_AMANDA=y -CONFIG_NF_CONNTRACK_EVENTS=y -CONFIG_NF_CONNTRACK_FTP=y -CONFIG_NF_CONNTRACK_H323=y -CONFIG_NF_CONNTRACK_IPV4=y -CONFIG_NF_CONNTRACK_IPV6=y -CONFIG_NF_CONNTRACK_IRC=y -CONFIG_NF_CONNTRACK_NETBIOS_NS=y -CONFIG_NF_CONNTRACK_PPTP=y -CONFIG_NF_CONNTRACK_SANE=y -CONFIG_NF_CONNTRACK_SECMARK=y -CONFIG_NF_CONNTRACK_TFTP=y -CONFIG_NF_CT_NETLINK=y -CONFIG_NF_CT_PROTO_DCCP=y -CONFIG_NF_CT_PROTO_SCTP=y -CONFIG_NF_CT_PROTO_UDPLITE=y -CONFIG_NF_NAT=y -CONFIG_NO_HZ=y -CONFIG_PACKET=y -CONFIG_PM_AUTOSLEEP=y -CONFIG_PM_WAKELOCKS=y -CONFIG_PPP=y -CONFIG_PPPOLAC=y -CONFIG_PPPOPNS=y -CONFIG_PPP_BSDCOMP=y -CONFIG_PPP_DEFLATE=y -CONFIG_PPP_MPPE=y -CONFIG_PREEMPT=y -CONFIG_PROFILING=y -CONFIG_RANDOMIZE_BASE=y -CONFIG_RTC_CLASS=y -CONFIG_RT_GROUP_SCHED=y -CONFIG_SECCOMP=y -CONFIG_SECURITY=y -CONFIG_SECURITY_NETWORK=y -CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y -CONFIG_SECURITY_SELINUX=y -CONFIG_STAGING=y -CONFIG_SYNC=y -CONFIG_TUN=y -CONFIG_UID_SYS_STATS=y -CONFIG_UNIX=y -CONFIG_USB_CONFIGFS=y -CONFIG_USB_CONFIGFS_F_ACC=y -CONFIG_USB_CONFIGFS_F_AUDIO_SRC=y -CONFIG_USB_CONFIGFS_F_FS=y -CONFIG_USB_CONFIGFS_F_MIDI=y -CONFIG_USB_CONFIGFS_F_MTP=y -CONFIG_USB_CONFIGFS_F_PTP=y -CONFIG_USB_CONFIGFS_UEVENT=y -CONFIG_USB_GADGET=y -CONFIG_XFRM_USER=y diff --git a/android/configs/android-recommended.cfg b/android/configs/android-recommended.cfg deleted file mode 100644 index 3d7e5e168940..000000000000 --- a/android/configs/android-recommended.cfg +++ /dev/null @@ -1,139 +0,0 @@ -# KEEP ALPHABETICALLY SORTED -# CONFIG_AIO is not set -# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set -# CONFIG_INPUT_MOUSE is not set -# CONFIG_LEGACY_PTYS is not set -# CONFIG_NF_CONNTRACK_SIP is not set -# CONFIG_PM_WAKELOCKS_GC is not set -# CONFIG_VT is not set -CONFIG_ANDROID_TIMED_GPIO=y -CONFIG_ARM64_SW_TTBR0_PAN=y -CONFIG_ARM_KERNMEM_PERMS=y -CONFIG_BACKLIGHT_LCD_SUPPORT=y -CONFIG_BLK_DEV_DM=y -CONFIG_BLK_DEV_LOOP=y -CONFIG_BLK_DEV_RAM=y -CONFIG_BLK_DEV_RAM_SIZE=8192 -CONFIG_CC_STACKPROTECTOR_STRONG=y -CONFIG_COMPACTION=y -CONFIG_CPU_SW_DOMAIN_PAN=y -CONFIG_DEBUG_RODATA=y -CONFIG_DM_CRYPT=y -CONFIG_DM_UEVENT=y -CONFIG_DM_VERITY=y -CONFIG_DM_VERITY_FEC=y -CONFIG_DRAGONRISE_FF=y -CONFIG_ENABLE_DEFAULT_TRACERS=y -CONFIG_EXT4_FS=y -CONFIG_EXT4_FS_SECURITY=y -CONFIG_FUSE_FS=y -CONFIG_GREENASIA_FF=y -CONFIG_HIDRAW=y -CONFIG_HID_A4TECH=y -CONFIG_HID_ACRUX=y -CONFIG_HID_ACRUX_FF=y -CONFIG_HID_APPLE=y -CONFIG_HID_BELKIN=y -CONFIG_HID_CHERRY=y -CONFIG_HID_CHICONY=y -CONFIG_HID_CYPRESS=y -CONFIG_HID_DRAGONRISE=y -CONFIG_HID_ELECOM=y -CONFIG_HID_EMS_FF=y -CONFIG_HID_EZKEY=y -CONFIG_HID_GREENASIA=y -CONFIG_HID_GYRATION=y -CONFIG_HID_HOLTEK=y -CONFIG_HID_KENSINGTON=y -CONFIG_HID_KEYTOUCH=y -CONFIG_HID_KYE=y -CONFIG_HID_LCPOWER=y -CONFIG_HID_LOGITECH=y -CONFIG_HID_LOGITECH_DJ=y -CONFIG_HID_MAGICMOUSE=y -CONFIG_HID_MICROSOFT=y -CONFIG_HID_MONTEREY=y -CONFIG_HID_MULTITOUCH=y -CONFIG_HID_NTRIG=y -CONFIG_HID_ORTEK=y -CONFIG_HID_PANTHERLORD=y -CONFIG_HID_PETALYNX=y -CONFIG_HID_PICOLCD=y -CONFIG_HID_PRIMAX=y -CONFIG_HID_PRODIKEYS=y -CONFIG_HID_ROCCAT=y -CONFIG_HID_SAITEK=y -CONFIG_HID_SAMSUNG=y -CONFIG_HID_SMARTJOYPLUS=y -CONFIG_HID_SONY=y -CONFIG_HID_SPEEDLINK=y -CONFIG_HID_SUNPLUS=y -CONFIG_HID_THRUSTMASTER=y -CONFIG_HID_TIVO=y -CONFIG_HID_TOPSEED=y -CONFIG_HID_TWINHAN=y -CONFIG_HID_UCLOGIC=y -CONFIG_HID_WACOM=y -CONFIG_HID_WALTOP=y -CONFIG_HID_WIIMOTE=y -CONFIG_HID_ZEROPLUS=y -CONFIG_HID_ZYDACRON=y -CONFIG_INPUT_EVDEV=y -CONFIG_INPUT_GPIO=y -CONFIG_INPUT_JOYSTICK=y -CONFIG_INPUT_KEYCHORD=y -CONFIG_INPUT_KEYRESET=y -CONFIG_INPUT_MISC=y -CONFIG_INPUT_TABLET=y -CONFIG_INPUT_UINPUT=y -CONFIG_ION=y -CONFIG_JOYSTICK_XPAD=y -CONFIG_JOYSTICK_XPAD_FF=y -CONFIG_JOYSTICK_XPAD_LEDS=y -CONFIG_KALLSYMS_ALL=y -CONFIG_KSM=y -CONFIG_LOGIG940_FF=y -CONFIG_LOGIRUMBLEPAD2_FF=y -CONFIG_LOGITECH_FF=y -CONFIG_MD=y -CONFIG_MEDIA_SUPPORT=y -CONFIG_MEMORY_STATE_TIME=y -CONFIG_MSDOS_FS=y -CONFIG_PANIC_TIMEOUT=5 -CONFIG_PANTHERLORD_FF=y -CONFIG_PERF_EVENTS=y -CONFIG_PM_DEBUG=y -CONFIG_PM_RUNTIME=y -CONFIG_PM_WAKELOCKS_LIMIT=0 -CONFIG_POWER_SUPPLY=y -CONFIG_PSTORE=y -CONFIG_PSTORE_CONSOLE=y -CONFIG_PSTORE_RAM=y -CONFIG_QFMT_V2=y -CONFIG_QUOTA=y -CONFIG_QUOTACTL=y -CONFIG_QUOTA_NETLINK_INTERFACE=y -CONFIG_QUOTA_TREE=y -CONFIG_SCHEDSTATS=y -CONFIG_SMARTJOYPLUS_FF=y -CONFIG_SND=y -CONFIG_SOUND=y -CONFIG_SUSPEND_TIME=y -CONFIG_TABLET_USB_ACECAD=y -CONFIG_TABLET_USB_AIPTEK=y -CONFIG_TABLET_USB_GTCO=y -CONFIG_TABLET_USB_HANWANG=y -CONFIG_TABLET_USB_KBTAB=y -CONFIG_TASKSTATS=y -CONFIG_TASK_DELAY_ACCT=y -CONFIG_TASK_IO_ACCOUNTING=y -CONFIG_TASK_XACCT=y -CONFIG_TIMER_STATS=y -CONFIG_TMPFS=y -CONFIG_TMPFS_POSIX_ACL=y -CONFIG_UHID=y -CONFIG_USB_ANNOUNCE_NEW_DEVICES=y -CONFIG_USB_EHCI_HCD=y -CONFIG_USB_HIDDEV=y -CONFIG_USB_USBNET=y -CONFIG_VFAT_FS=y From 268ebc9cae4c891b79292661c7463c64584353d4 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 8 Sep 2017 02:09:26 -0400 Subject: [PATCH 397/546] FROMLIST: android: binder: Drop lru lock in isolate callback MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9945123/) Drop the global lru lock in isolate callback before calling zap_page_range which calls cond_resched, and re-acquire the global lru lock before returning. Also change return code to LRU_REMOVED_RETRY. Use mmput_async when fail to acquire mmap sem in an atomic context. Fix "BUG: sleeping function called from invalid context" errors when CONFIG_DEBUG_ATOMIC_SLEEP is enabled. Bug: 63926541 Change-Id: I45dbada421b715abed9a66d03d30ae2285671ca1 Fixes: f2517eb76f1f2 ("android: binder: Add global lru shrinker to binder") Reported-by: Kyle Yan Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 1c1a7d9c86ed..9839afa84dfd 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -913,6 +913,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, struct binder_alloc *alloc; uintptr_t page_addr; size_t index; + struct vm_area_struct *vma; alloc = page->alloc; if (!mutex_trylock(&alloc->mutex)) @@ -923,16 +924,22 @@ enum lru_status binder_alloc_free_page(struct list_head *item, index = page - alloc->pages; page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; - if (alloc->vma) { + vma = alloc->vma; + if (vma) { mm = get_task_mm(alloc->tsk); if (!mm) goto err_get_task_mm_failed; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; + } + list_lru_isolate(lru, item); + spin_unlock(lock); + + if (vma) { trace_binder_unmap_user_start(alloc, index); - zap_page_range(alloc->vma, + zap_page_range(vma, page_addr + alloc->user_buffer_offset, PAGE_SIZE, NULL); @@ -951,13 +958,12 @@ enum lru_status binder_alloc_free_page(struct list_head *item, trace_binder_unmap_kernel_end(alloc, index); - list_lru_isolate(lru, item); - + spin_lock(lock); mutex_unlock(&alloc->mutex); - return LRU_REMOVED; + return LRU_REMOVED_RETRY; err_down_write_mmap_sem_failed: - mmput(mm); + mmput_async(mm); err_get_task_mm_failed: err_page_already_freed: mutex_unlock(&alloc->mutex); From e7edd94d111d3b4eb63011c6f40b88a0f06c171c Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 15 Sep 2017 20:40:03 -0400 Subject: [PATCH 398/546] FROMLIST: android: binder: Remove unused vma argument MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9954123/) The vma argument in update_page_range is no longer used after 74310e06 ("android: binder: Move buffer out of area shared with user space"), since mmap_handler no longer calls update_page_range with a vma. Test: ran binderLibTest, throughputtest, interfacetest and mempressure Bug: 36007193 Change-Id: Ibd6f24c11750f8f7e6ed56e40dd18c08e02ace25 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index 9839afa84dfd..cfc3f5664015 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -186,12 +186,12 @@ struct binder_buffer *binder_alloc_prepare_to_free(struct binder_alloc *alloc, } static int binder_update_page_range(struct binder_alloc *alloc, int allocate, - void *start, void *end, - struct vm_area_struct *vma) + void *start, void *end) { void *page_addr; unsigned long user_page_addr; struct binder_lru_page *page; + struct vm_area_struct *vma = NULL; struct mm_struct *mm = NULL; bool need_mm = false; @@ -215,7 +215,7 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (!vma && need_mm) + if (need_mm) mm = get_task_mm(alloc->tsk); if (mm) { @@ -442,7 +442,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, if (end_page_addr > has_page_addr) end_page_addr = has_page_addr; ret = binder_update_page_range(alloc, 1, - (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr, NULL); + (void *)PAGE_ALIGN((uintptr_t)buffer->data), end_page_addr); if (ret) return ERR_PTR(ret); @@ -483,7 +483,7 @@ struct binder_buffer *binder_alloc_new_buf_locked(struct binder_alloc *alloc, err_alloc_buf_struct_failed: binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - end_page_addr, NULL); + end_page_addr); return ERR_PTR(-ENOMEM); } @@ -567,8 +567,7 @@ static void binder_delete_free_buffer(struct binder_alloc *alloc, alloc->pid, buffer->data, prev->data, next->data); binder_update_page_range(alloc, 0, buffer_start_page(buffer), - buffer_start_page(buffer) + PAGE_SIZE, - NULL); + buffer_start_page(buffer) + PAGE_SIZE); } list_del(&buffer->entry); kfree(buffer); @@ -605,8 +604,7 @@ static void binder_free_buf_locked(struct binder_alloc *alloc, binder_update_page_range(alloc, 0, (void *)PAGE_ALIGN((uintptr_t)buffer->data), - (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK), - NULL); + (void *)(((uintptr_t)buffer->data + buffer_size) & PAGE_MASK)); rb_erase(&buffer->rb_node, &alloc->allocated_buffers); buffer->free = 1; From a455dfd674dd89ab8221f64969c80eca23eb8248 Mon Sep 17 00:00:00 2001 From: Sherry Yang Date: Fri, 15 Sep 2017 21:12:15 -0400 Subject: [PATCH 399/546] FROMLIST: android: binder: Don't get mm from task MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (from https://patchwork.kernel.org/patch/9954125/) Use binder_alloc struct's mm_struct rather than getting a reference to the mm struct through get_task_mm to avoid a potential deadlock between lru lock, task lock and dentry lock, since a thread can be holding the task lock and the dentry lock while trying to acquire the lru lock. Test: ran binderLibTest, throughputtest, interfacetest and mempressure w/lockdep Bug: 63926541 Change-Id: Icc661404eb7a4a2ecc5234b1bf8f0104665f9b45 Acked-by: Arve Hjønnevåg Signed-off-by: Sherry Yang --- drivers/android/binder_alloc.c | 25 ++++++++++++------------- drivers/android/binder_alloc.h | 1 - 2 files changed, 12 insertions(+), 14 deletions(-) diff --git a/drivers/android/binder_alloc.c b/drivers/android/binder_alloc.c index cfc3f5664015..b95da16fd938 100644 --- a/drivers/android/binder_alloc.c +++ b/drivers/android/binder_alloc.c @@ -215,17 +215,13 @@ static int binder_update_page_range(struct binder_alloc *alloc, int allocate, } } - if (need_mm) - mm = get_task_mm(alloc->tsk); + /* Same as mmget_not_zero() in later kernel versions */ + if (need_mm && atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) + mm = alloc->vma_vm_mm; if (mm) { down_write(&mm->mmap_sem); vma = alloc->vma; - if (vma && mm != alloc->vma_vm_mm) { - pr_err("%d: vma mm and task mm mismatch\n", - alloc->pid); - vma = NULL; - } } if (!vma && need_mm) { @@ -718,6 +714,8 @@ int binder_alloc_mmap_handler(struct binder_alloc *alloc, barrier(); alloc->vma = vma; alloc->vma_vm_mm = vma->vm_mm; + /* Same as mmgrab() in later kernel versions */ + atomic_inc(&alloc->vma_vm_mm->mm_count); return 0; @@ -793,6 +791,8 @@ void binder_alloc_deferred_release(struct binder_alloc *alloc) vfree(alloc->buffer); } mutex_unlock(&alloc->mutex); + if (alloc->vma_vm_mm) + mmdrop(alloc->vma_vm_mm); binder_alloc_debug(BINDER_DEBUG_OPEN_CLOSE, "%s: %d buffers %d, pages %d\n", @@ -887,7 +887,6 @@ int binder_alloc_get_allocated_count(struct binder_alloc *alloc) void binder_alloc_vma_close(struct binder_alloc *alloc) { WRITE_ONCE(alloc->vma, NULL); - WRITE_ONCE(alloc->vma_vm_mm, NULL); } /** @@ -924,9 +923,10 @@ enum lru_status binder_alloc_free_page(struct list_head *item, page_addr = (uintptr_t)alloc->buffer + index * PAGE_SIZE; vma = alloc->vma; if (vma) { - mm = get_task_mm(alloc->tsk); - if (!mm) - goto err_get_task_mm_failed; + /* Same as mmget_not_zero() in later kernel versions */ + if (!atomic_inc_not_zero(&alloc->vma_vm_mm->mm_users)) + goto err_mmget; + mm = alloc->vma_vm_mm; if (!down_write_trylock(&mm->mmap_sem)) goto err_down_write_mmap_sem_failed; } @@ -962,7 +962,7 @@ enum lru_status binder_alloc_free_page(struct list_head *item, err_down_write_mmap_sem_failed: mmput_async(mm); -err_get_task_mm_failed: +err_mmget: err_page_already_freed: mutex_unlock(&alloc->mutex); err_get_alloc_mutex_failed: @@ -1001,7 +1001,6 @@ struct shrinker binder_shrinker = { */ void binder_alloc_init(struct binder_alloc *alloc) { - alloc->tsk = current->group_leader; alloc->pid = current->group_leader->pid; mutex_init(&alloc->mutex); INIT_LIST_HEAD(&alloc->buffers); diff --git a/drivers/android/binder_alloc.h b/drivers/android/binder_alloc.h index a3a3602c689c..2dd33b6df104 100644 --- a/drivers/android/binder_alloc.h +++ b/drivers/android/binder_alloc.h @@ -100,7 +100,6 @@ struct binder_lru_page { */ struct binder_alloc { struct mutex mutex; - struct task_struct *tsk; struct vm_area_struct *vma; struct mm_struct *vma_vm_mm; void *buffer; From 37c7a3c87649b2e4702de7aca97cb8e09cc91063 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Fri, 20 May 2016 16:57:21 -0700 Subject: [PATCH 400/546] BACKPORT: partial: mm, oom_reaper: do not mmput synchronously from the oom reaper context MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit (cherry picked from commit ec8d7c14ea14922fe21945b458a75e39f11dd832) Tetsuo has properly noted that mmput slow path might get blocked waiting for another party (e.g. exit_aio waits for an IO). If that happens the oom_reaper would be put out of the way and will not be able to process next oom victim. We should strive for making this context as reliable and independent on other subsystems as much as possible. Introduce mmput_async which will perform the slow path from an async (WQ) context. This will delay the operation but that shouldn't be a problem because the oom_reaper has reclaimed the victim's address space for most cases as much as possible and the remaining context shouldn't bind too much memory anymore. The only exception is when mmap_sem trylock has failed which shouldn't happen too often. The issue is only theoretical but not impossible. Signed-off-by: Michal Hocko Reported-by: Tetsuo Handa Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds Only backports mmput_async. Change-Id: I5fe54abcc629e7d9eab9fe03908903d1174177f1 Signed-off-by: Arve Hjønnevåg --- include/linux/mm_types.h | 2 ++ include/linux/sched.h | 5 ++++ kernel/fork.c | 52 +++++++++++++++++++++++++++------------- 3 files changed, 43 insertions(+), 16 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index b32cb7add09c..907f94428ccc 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -12,6 +12,7 @@ #include #include #include +#include #include #include @@ -523,6 +524,7 @@ struct mm_struct { #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif + struct work_struct async_put_work; }; static inline void mm_init_cpumask(struct mm_struct *mm) diff --git a/include/linux/sched.h b/include/linux/sched.h index a6800fb3f4ea..d0d0b1b45418 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -2764,6 +2764,11 @@ static inline void mmdrop(struct mm_struct * mm) /* mmput gets rid of the mappings and all user-space */ extern void mmput(struct mm_struct *); +/* same as above but performs the slow path from the async kontext. Can + * be called from the atomic context as well + */ +extern void mmput_async(struct mm_struct *); + /* Grab a reference to a task's mm, if it is not already going away */ extern struct mm_struct *get_task_mm(struct task_struct *task); /* diff --git a/kernel/fork.c b/kernel/fork.c index f98b3360397c..2633d5f61d3c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -695,6 +695,26 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +static inline void __mmput(struct mm_struct *mm) +{ + VM_BUG_ON(atomic_read(&mm->mm_users)); + + uprobe_clear_state(mm); + exit_aio(mm); + ksm_exit(mm); + khugepaged_exit(mm); /* must run before exit_mmap */ + exit_mmap(mm); + set_mm_exe_file(mm, NULL); + if (!list_empty(&mm->mmlist)) { + spin_lock(&mmlist_lock); + list_del(&mm->mmlist); + spin_unlock(&mmlist_lock); + } + if (mm->binfmt) + module_put(mm->binfmt->module); + mmdrop(mm); +} + /* * Decrement the use count and release all resources for an mm. */ @@ -702,25 +722,25 @@ void mmput(struct mm_struct *mm) { might_sleep(); - if (atomic_dec_and_test(&mm->mm_users)) { - uprobe_clear_state(mm); - exit_aio(mm); - ksm_exit(mm); - khugepaged_exit(mm); /* must run before exit_mmap */ - exit_mmap(mm); - set_mm_exe_file(mm, NULL); - if (!list_empty(&mm->mmlist)) { - spin_lock(&mmlist_lock); - list_del(&mm->mmlist); - spin_unlock(&mmlist_lock); - } - if (mm->binfmt) - module_put(mm->binfmt->module); - mmdrop(mm); - } + if (atomic_dec_and_test(&mm->mm_users)) + __mmput(mm); } EXPORT_SYMBOL_GPL(mmput); +static void mmput_async_fn(struct work_struct *work) +{ + struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); + __mmput(mm); +} + +void mmput_async(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_users)) { + INIT_WORK(&mm->async_put_work, mmput_async_fn); + schedule_work(&mm->async_put_work); + } +} + /** * set_mm_exe_file - change a reference to the mm's executable file * From 060add8804d93e20794ee5f0f604b77943baa7f9 Mon Sep 17 00:00:00 2001 From: Vikram Mulukutla Date: Thu, 21 Sep 2017 17:24:24 -0700 Subject: [PATCH 401/546] Revert "ANDROID: sched/tune: Initialize raw_spin_lock in boosted_groups" This reverts commit c5616f2f874faa20b59b116177b99bf3948586df. If we re-init the per-cpu boostgroup spinlock every time that we add a new boosted cgroup, we can easily wipe out (reinit) a spinlock struct while in a critical section. We should only be setting up the per-cpu boostgroup data, and the spin_lock initialization need only happen once - which we're already doing in a postcore_initcall. For example: -------- CPU 0 -------- | -------- CPU1 -------- cgroupX boost group added | schedtune_enqueue_task | acquires(bg->lock) | cgroupY boost group added | for_each_cpu() | raw_spin_lock_init(bg->lock) releases(bg->lock) | BUG (already unlocked) | | This results in the following BUG from the debug spinlock code: BUG: spinlock already unlocked on CPU#5, rcuop/6/68 Change-Id: I3016702780b461a0cd95e26c538cd18df27d6316 Signed-off-by: Vikram Mulukutla --- kernel/sched/tune.c | 1 - 1 file changed, 1 deletion(-) diff --git a/kernel/sched/tune.c b/kernel/sched/tune.c index 86d04caf1684..d444fc1a4d58 100644 --- a/kernel/sched/tune.c +++ b/kernel/sched/tune.c @@ -647,7 +647,6 @@ schedtune_boostgroup_init(struct schedtune *st) bg = &per_cpu(cpu_boost_groups, cpu); bg->group[st->idx].boost = 0; bg->group[st->idx].tasks = 0; - raw_spin_lock_init(&bg->lock); } return 0; From 7d5c08fd91940f06f8c315f46823cda2ced27103 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 17:18:56 -0700 Subject: [PATCH 402/546] f2fs: backport from (4c1fad64 - Merge tag 'for-f2fs-4.9' of git://git.kernel.org/pub/scm/linux/kernel/git/jaegeuk/f2fs) Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 18 +- fs/Kconfig | 2 + fs/Makefile | 1 + fs/crypto/Kconfig | 18 + fs/crypto/Makefile | 3 + fs/crypto/crypto.c | 568 +++++++++ fs/{f2fs/crypto_fname.c => crypto/fname.c} | 278 ++-- fs/crypto/keyinfo.c | 304 +++++ fs/crypto/policy.c | 246 ++++ fs/f2fs/Kconfig | 21 +- fs/f2fs/Makefile | 2 - fs/f2fs/acl.c | 29 +- fs/f2fs/acl.h | 3 +- fs/f2fs/checkpoint.c | 529 +++++--- fs/f2fs/crypto.c | 491 -------- fs/f2fs/data.c | 1250 ++++++++++-------- fs/f2fs/debug.c | 80 +- fs/f2fs/dir.c | 477 +++---- fs/f2fs/extent_cache.c | 315 ++--- fs/f2fs/f2fs.h | 1001 +++++++++------ fs/f2fs/file.c | 1328 ++++++++++++++------ fs/f2fs/gc.c | 353 ++++-- fs/f2fs/gc.h | 8 - fs/f2fs/hash.c | 7 +- fs/f2fs/inline.c | 268 ++-- fs/f2fs/inode.c | 176 +-- fs/f2fs/namei.c | 409 +++--- fs/f2fs/node.c | 733 +++++++---- fs/f2fs/node.h | 123 +- fs/f2fs/recovery.c | 274 ++-- fs/f2fs/segment.c | 698 +++++----- fs/f2fs/segment.h | 47 +- fs/f2fs/shrinker.c | 8 +- fs/f2fs/super.c | 786 +++++++++--- fs/f2fs/trace.c | 6 +- fs/f2fs/xattr.c | 69 +- fs/f2fs/xattr.h | 3 +- include/linux/dcache.h | 1 + include/linux/f2fs_fs.h | 44 +- include/linux/fs.h | 7 + include/linux/fscrypto.h | 435 +++++++ include/trace/events/f2fs.h | 64 +- include/uapi/linux/fs.h | 18 + 43 files changed, 7560 insertions(+), 3941 deletions(-) create mode 100644 fs/crypto/Kconfig create mode 100644 fs/crypto/Makefile create mode 100644 fs/crypto/crypto.c rename fs/{f2fs/crypto_fname.c => crypto/fname.c} (53%) create mode 100644 fs/crypto/keyinfo.c create mode 100644 fs/crypto/policy.c delete mode 100644 fs/f2fs/crypto.c create mode 100644 include/linux/fscrypto.h diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index b102b436563e..753dd4f96afe 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -102,14 +102,16 @@ background_gc=%s Turn on/off cleaning operations, namely garbage collection, triggered in background when I/O subsystem is idle. If background_gc=on, it will turn on the garbage collection and if background_gc=off, garbage collection - will be truned off. If background_gc=sync, it will turn + will be turned off. If background_gc=sync, it will turn on synchronous garbage collection running in background. Default value for this option is on. So garbage collection is on by default. disable_roll_forward Disable the roll-forward recovery routine norecovery Disable the roll-forward recovery routine, mounted read- only (i.e., -o ro,disable_roll_forward) -discard Issue discard/TRIM commands when a segment is cleaned. +discard/nodiscard Enable/disable real-time discard in f2fs, if discard is + enabled, f2fs will issue discard/TRIM commands when a + segment is cleaned. no_heap Disable heap-style segment allocation which finds free segments for data from the beginning of main area, while for node from the end of main area. @@ -129,6 +131,7 @@ inline_dentry Enable the inline dir feature: data in new created directory entries can be written into inode block. The space of inode block which is used to store inline dentries is limited to ~3.4k. +noinline_dentry Diable the inline dentry feature. flush_merge Merge concurrent cache_flush commands as much as possible to eliminate redundant command issues. If the underlying device handles the cache_flush command relatively slowly, @@ -145,10 +148,15 @@ extent_cache Enable an extent cache based on rb-tree, it can cache as many as extent which map between contiguous logical address and physical address per inode, resulting in increasing the cache hit ratio. Set by default. -noextent_cache Diable an extent cache based on rb-tree explicitly, see +noextent_cache Disable an extent cache based on rb-tree explicitly, see the above extent_cache mount option. noinline_data Disable the inline data feature, inline data feature is enabled by default. +data_flush Enable data flushing before checkpoint in order to + persist data of regular and symlink. +mode=%s Control block allocation mode which supports "adaptive" + and "lfs". In "lfs" mode, there should be no random + writes towards main area. ================================================================================ DEBUGFS ENTRIES @@ -192,7 +200,7 @@ Files in /sys/fs/f2fs/ policy for garbage collection. Setting gc_idle = 0 (default) will disable this option. Setting gc_idle = 1 will select the Cost Benefit approach - & setting gc_idle = 2 will select the greedy aproach. + & setting gc_idle = 2 will select the greedy approach. reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree @@ -298,7 +306,7 @@ The dump.f2fs shows the information of specific inode and dumps SSA and SIT to file. Each file is dump_ssa and dump_sit. The dump.f2fs is used to debug on-disk data structures of the f2fs filesystem. -It shows on-disk inode information reconized by a given inode number, and is +It shows on-disk inode information recognized by a given inode number, and is able to dump all the SSA and SIT entries into predefined files, ./dump_ssa and ./dump_sit respectively. diff --git a/fs/Kconfig b/fs/Kconfig index a5d2dc39ba07..80af05163579 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -73,6 +73,8 @@ config FILE_LOCKING for filesystems like NFS and for the flock() system call. Disabling this option saves about 11k. +source "fs/crypto/Kconfig" + source "fs/notify/Kconfig" source "fs/quota/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index dee237540bc0..4644db462ba9 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -30,6 +30,7 @@ obj-$(CONFIG_EVENTFD) += eventfd.o obj-$(CONFIG_USERFAULTFD) += userfaultfd.o obj-$(CONFIG_AIO) += aio.o obj-$(CONFIG_FS_DAX) += dax.o +obj-$(CONFIG_FS_ENCRYPTION) += crypto/ obj-$(CONFIG_FILE_LOCKING) += locks.o obj-$(CONFIG_COMPAT) += compat.o compat_ioctl.o obj-$(CONFIG_BINFMT_AOUT) += binfmt_aout.o diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig new file mode 100644 index 000000000000..92348faf9865 --- /dev/null +++ b/fs/crypto/Kconfig @@ -0,0 +1,18 @@ +config FS_ENCRYPTION + tristate "FS Encryption (Per-file encryption)" + depends on BLOCK + select CRYPTO + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_ECB + select CRYPTO_XTS + select CRYPTO_CTS + select CRYPTO_CTR + select CRYPTO_SHA256 + select KEYS + select ENCRYPTED_KEYS + help + Enable encryption of files and directories. This + feature is similar to ecryptfs, but it is more memory + efficient since it avoids caching the encrypted and + decrypted pages in the page cache. diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile new file mode 100644 index 000000000000..f17684c48739 --- /dev/null +++ b/fs/crypto/Makefile @@ -0,0 +1,3 @@ +obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o + +fscrypto-y := crypto.o fname.o policy.o keyinfo.o diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c new file mode 100644 index 000000000000..2fc8c43ce531 --- /dev/null +++ b/fs/crypto/crypto.c @@ -0,0 +1,568 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +static unsigned int num_prealloc_crypto_pages = 32; +static unsigned int num_prealloc_crypto_ctxs = 128; + +module_param(num_prealloc_crypto_pages, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_pages, + "Number of crypto pages to preallocate"); +module_param(num_prealloc_crypto_ctxs, uint, 0444); +MODULE_PARM_DESC(num_prealloc_crypto_ctxs, + "Number of crypto contexts to preallocate"); + +static mempool_t *fscrypt_bounce_page_pool = NULL; + +static LIST_HEAD(fscrypt_free_ctxs); +static DEFINE_SPINLOCK(fscrypt_ctx_lock); + +static struct workqueue_struct *fscrypt_read_workqueue; +static DEFINE_MUTEX(fscrypt_init_mutex); + +static struct kmem_cache *fscrypt_ctx_cachep; +struct kmem_cache *fscrypt_info_cachep; + +/** + * fscrypt_release_ctx() - Releases an encryption context + * @ctx: The encryption context to release. + * + * If the encryption context was allocated from the pre-allocated pool, returns + * it to that pool. Else, frees it. + * + * If there's a bounce page in the context, this frees that. + */ +void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + unsigned long flags; + + if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); + ctx->w.bounce_page = NULL; + } + ctx->w.control_page = NULL; + if (ctx->flags & FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { + kmem_cache_free(fscrypt_ctx_cachep, ctx); + } else { + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + list_add(&ctx->free_list, &fscrypt_free_ctxs); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + } +} +EXPORT_SYMBOL(fscrypt_release_ctx); + +/** + * fscrypt_get_ctx() - Gets an encryption context + * @inode: The inode for which we are doing the crypto + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates and initializes an encryption context. + * + * Return: An allocated and initialized encryption context on success; error + * value or NULL otherwise. + */ +struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx = NULL; + struct fscrypt_info *ci = inode->i_crypt_info; + unsigned long flags; + + if (ci == NULL) + return ERR_PTR(-ENOKEY); + + /* + * We first try getting the ctx from a free list because in + * the common case the ctx will have an allocated and + * initialized crypto tfm, so it's probably a worthwhile + * optimization. For the bounce page, we first try getting it + * from the kernel allocator because that's just about as fast + * as getting it from a list and because a cache of free pages + * should generally be a "last resort" option for a filesystem + * to be able to do its job. + */ + spin_lock_irqsave(&fscrypt_ctx_lock, flags); + ctx = list_first_entry_or_null(&fscrypt_free_ctxs, + struct fscrypt_ctx, free_list); + if (ctx) + list_del(&ctx->free_list); + spin_unlock_irqrestore(&fscrypt_ctx_lock, flags); + if (!ctx) { + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, gfp_flags); + if (!ctx) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } else { + ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; + } + ctx->flags &= ~FS_WRITE_PATH_FL; + return ctx; +} +EXPORT_SYMBOL(fscrypt_get_ctx); + +/** + * fscrypt_complete() - The completion callback for page encryption + * @req: The asynchronous encryption request context + * @res: The result of the encryption operation + */ +static void fscrypt_complete(struct crypto_async_request *req, int res) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (res == -EINPROGRESS) + return; + ecr->res = res; + complete(&ecr->completion); +} + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +static int do_page_crypto(struct inode *inode, + fscrypt_direction_t rw, pgoff_t index, + struct page *src_page, struct page *dest_page, + gfp_t gfp_flags) +{ + u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist dst, src; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; + int res = 0; + + req = skcipher_request_alloc(tfm, gfp_flags); + if (!req) { + printk_ratelimited(KERN_ERR + "%s: crypto_request_alloc() failed\n", + __func__); + return -ENOMEM; + } + + skcipher_request_set_callback( + req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + fscrypt_complete, &ecr); + + BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); + memcpy(xts_tweak, &index, sizeof(index)); + memset(&xts_tweak[sizeof(index)], 0, + FS_XTS_TWEAK_SIZE - sizeof(index)); + + sg_init_table(&dst, 1); + sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_init_table(&src, 1); + sg_set_page(&src, src_page, PAGE_SIZE, 0); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, + xts_tweak); + if (rw == FS_DECRYPT) + res = crypto_skcipher_decrypt(req); + else + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + BUG_ON(req->base.data != &ecr); + wait_for_completion(&ecr.completion); + res = ecr.res; + } + skcipher_request_free(req); + if (res) { + printk_ratelimited(KERN_ERR + "%s: crypto_skcipher_encrypt() returned %d\n", + __func__, res); + return res; + } + return 0; +} + +static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +{ + ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); + if (ctx->w.bounce_page == NULL) + return ERR_PTR(-ENOMEM); + ctx->flags |= FS_WRITE_PATH_FL; + return ctx->w.bounce_page; +} + +/** + * fscypt_encrypt_page() - Encrypts a page + * @inode: The inode for which the encryption should take place + * @plaintext_page: The page to encrypt. Must be locked. + * @gfp_flags: The gfp flag for memory allocation + * + * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx + * encryption context. + * + * Called on the page write path. The caller must call + * fscrypt_restore_control_page() on the returned ciphertext page to + * release the bounce buffer and the encryption context. + * + * Return: An allocated page with the encrypted content on success. Else, an + * error value or NULL. + */ +struct page *fscrypt_encrypt_page(struct inode *inode, + struct page *plaintext_page, gfp_t gfp_flags) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + int err; + + BUG_ON(!PageLocked(plaintext_page)); + + ctx = fscrypt_get_ctx(inode, gfp_flags); + if (IS_ERR(ctx)) + return (struct page *)ctx; + + /* The encryption operation will require a bounce page. */ + ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + if (IS_ERR(ciphertext_page)) + goto errout; + + ctx->w.control_page = plaintext_page; + err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, + plaintext_page, ciphertext_page, + gfp_flags); + if (err) { + ciphertext_page = ERR_PTR(err); + goto errout; + } + SetPagePrivate(ciphertext_page); + set_page_private(ciphertext_page, (unsigned long)ctx); + lock_page(ciphertext_page); + return ciphertext_page; + +errout: + fscrypt_release_ctx(ctx); + return ciphertext_page; +} +EXPORT_SYMBOL(fscrypt_encrypt_page); + +/** + * f2crypt_decrypt_page() - Decrypts a page in-place + * @page: The page to decrypt. Must be locked. + * + * Decrypts page in-place using the ctx encryption context. + * + * Called from the read completion callback. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_decrypt_page(struct page *page) +{ + BUG_ON(!PageLocked(page)); + + return do_page_crypto(page->mapping->host, + FS_DECRYPT, page->index, page, page, GFP_NOFS); +} +EXPORT_SYMBOL(fscrypt_decrypt_page); + +int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(WRITE, bio); + if ((err == 0) && bio->bi_error) + err = -EIO; + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); + +/* + * Validate dentries for encrypted directories to make sure we aren't + * potentially caching stale data after a key has been added or + * removed. + */ +static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) +{ + struct dentry *dir; + struct fscrypt_info *ci; + int dir_has_key, cached_with_key; + + if (flags & LOOKUP_RCU) + return -ECHILD; + + dir = dget_parent(dentry); + if (!d_inode(dir)->i_sb->s_cop->is_encrypted(d_inode(dir))) { + dput(dir); + return 0; + } + + ci = d_inode(dir)->i_crypt_info; + if (ci && ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD)))) + ci = NULL; + + /* this should eventually be an flag in d_flags */ + spin_lock(&dentry->d_lock); + cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); + dir_has_key = (ci != NULL); + dput(dir); + + /* + * If the dentry was cached without the key, and it is a + * negative dentry, it might be a valid name. We can't check + * if the key has since been made available due to locking + * reasons, so we fail the validation so ext4_lookup() can do + * this check. + * + * We also fail the validation if the dentry was created with + * the key present, but we no longer have the key, or vice versa. + */ + if ((!cached_with_key && d_is_negative(dentry)) || + (!cached_with_key && dir_has_key) || + (cached_with_key && !dir_has_key)) + return 0; + return 1; +} + +const struct dentry_operations fscrypt_d_ops = { + .d_revalidate = fscrypt_d_revalidate, +}; +EXPORT_SYMBOL(fscrypt_d_ops); + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +void fscrypt_restore_control_page(struct page *page) +{ + struct fscrypt_ctx *ctx; + + ctx = (struct fscrypt_ctx *)page_private(page); + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + fscrypt_release_ctx(ctx); +} +EXPORT_SYMBOL(fscrypt_restore_control_page); + +static void fscrypt_destroy(void) +{ + struct fscrypt_ctx *pos, *n; + + list_for_each_entry_safe(pos, n, &fscrypt_free_ctxs, free_list) + kmem_cache_free(fscrypt_ctx_cachep, pos); + INIT_LIST_HEAD(&fscrypt_free_ctxs); + mempool_destroy(fscrypt_bounce_page_pool); + fscrypt_bounce_page_pool = NULL; +} + +/** + * fscrypt_initialize() - allocate major buffers for fs encryption. + * + * We only call this when we start accessing encrypted files, since it + * results in memory getting allocated that wouldn't otherwise be used. + * + * Return: Zero on success, non-zero otherwise. + */ +int fscrypt_initialize(void) +{ + int i, res = -ENOMEM; + + if (fscrypt_bounce_page_pool) + return 0; + + mutex_lock(&fscrypt_init_mutex); + if (fscrypt_bounce_page_pool) + goto already_initialized; + + for (i = 0; i < num_prealloc_crypto_ctxs; i++) { + struct fscrypt_ctx *ctx; + + ctx = kmem_cache_zalloc(fscrypt_ctx_cachep, GFP_NOFS); + if (!ctx) + goto fail; + list_add(&ctx->free_list, &fscrypt_free_ctxs); + } + + fscrypt_bounce_page_pool = + mempool_create_page_pool(num_prealloc_crypto_pages, 0); + if (!fscrypt_bounce_page_pool) + goto fail; + +already_initialized: + mutex_unlock(&fscrypt_init_mutex); + return 0; +fail: + fscrypt_destroy(); + mutex_unlock(&fscrypt_init_mutex); + return res; +} +EXPORT_SYMBOL(fscrypt_initialize); + +/** + * fscrypt_init() - Set up for fs encryption. + */ +static int __init fscrypt_init(void) +{ + fscrypt_read_workqueue = alloc_workqueue("fscrypt_read_queue", + WQ_HIGHPRI, 0); + if (!fscrypt_read_workqueue) + goto fail; + + fscrypt_ctx_cachep = KMEM_CACHE(fscrypt_ctx, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_ctx_cachep) + goto fail_free_queue; + + fscrypt_info_cachep = KMEM_CACHE(fscrypt_info, SLAB_RECLAIM_ACCOUNT); + if (!fscrypt_info_cachep) + goto fail_free_ctx; + + return 0; + +fail_free_ctx: + kmem_cache_destroy(fscrypt_ctx_cachep); +fail_free_queue: + destroy_workqueue(fscrypt_read_workqueue); +fail: + return -ENOMEM; +} +module_init(fscrypt_init) + +/** + * fscrypt_exit() - Shutdown the fs encryption system + */ +static void __exit fscrypt_exit(void) +{ + fscrypt_destroy(); + + if (fscrypt_read_workqueue) + destroy_workqueue(fscrypt_read_workqueue); + kmem_cache_destroy(fscrypt_ctx_cachep); + kmem_cache_destroy(fscrypt_info_cachep); +} +module_exit(fscrypt_exit); + +MODULE_LICENSE("GPL"); diff --git a/fs/f2fs/crypto_fname.c b/fs/crypto/fname.c similarity index 53% rename from fs/f2fs/crypto_fname.c rename to fs/crypto/fname.c index 38349ed5ea51..5d6d49113efa 100644 --- a/fs/f2fs/crypto_fname.c +++ b/fs/crypto/fname.c @@ -1,46 +1,32 @@ /* - * linux/fs/f2fs/crypto_fname.c - * - * Copied from linux/fs/ext4/crypto.c + * This contains functions for filename crypto management * * Copyright (C) 2015, Google, Inc. * Copyright (C) 2015, Motorola Mobility * - * This contains functions for filename crypto management in f2fs - * * Written by Uday Savagaonkar, 2014. - * - * Adjust f2fs dentry structure - * Jaegeuk Kim, 2015. + * Modified by Jaegeuk Kim, 2015. * * This has not yet undergone a rigorous security audit. */ -#include -#include + #include #include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include +#include -#include "f2fs.h" -#include "f2fs_crypto.h" -#include "xattr.h" +static u32 size_round_up(size_t size, size_t blksize) +{ + return ((size + blksize - 1) / blksize) * blksize; +} /** - * f2fs_dir_crypt_complete() - + * dir_crypt_complete() - */ -static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) +static void dir_crypt_complete(struct crypto_async_request *req, int res) { - struct f2fs_completion_result *ecr = req->data; + struct fscrypt_completion_result *ecr = req->data; if (res == -EINPROGRESS) return; @@ -48,45 +34,35 @@ static void f2fs_dir_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -bool f2fs_valid_filenames_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_CTS); -} - -static unsigned max_name_len(struct inode *inode) -{ - return S_ISLNK(inode->i_mode) ? inode->i_sb->s_blocksize : - F2FS_NAME_LEN; -} - /** - * f2fs_fname_encrypt() - + * fname_encrypt() - * * This function encrypts the input filename, and returns the length of the * ciphertext. Errors are returned as negative numbers. We trust the caller to * allocate sufficient memory to oname string. */ -static int f2fs_fname_encrypt(struct inode *inode, - const struct qstr *iname, struct f2fs_str *oname) +static int fname_encrypt(struct inode *inode, + const struct qstr *iname, struct fscrypt_str *oname) { u32 ciphertext_len; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; + char iv[FS_CRYPTO_BLOCK_SIZE]; struct scatterlist src_sg, dst_sg; - int padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); + int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim = max_name_len(inode); + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < F2FS_CRYPTO_BLOCK_SIZE) ? - F2FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = f2fs_fname_crypto_round_up(ciphertext_len, padding); + ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? + FS_CRYPTO_BLOCK_SIZE : iname->len; + ciphertext_len = size_round_up(ciphertext_len, padding); ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; if (ciphertext_len <= sizeof(buf)) { @@ -99,16 +75,16 @@ static int f2fs_fname_encrypt(struct inode *inode, } /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); kfree(alloc_buf); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Copy the input */ memcpy(workbuf, iname->name, iname->len); @@ -116,79 +92,78 @@ static int f2fs_fname_encrypt(struct inode *inode, memset(workbuf + iname->len, 0, ciphertext_len - iname->len); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create encryption request */ sg_init_one(&src_sg, workbuf, ciphertext_len); sg_init_one(&dst_sg, oname->name, ciphertext_len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); - res = crypto_ablkcipher_encrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } kfree(alloc_buf); - ablkcipher_request_free(req); - if (res < 0) { + skcipher_request_free(req); + if (res < 0) printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); - } + oname->len = ciphertext_len; return res; } /* - * f2fs_fname_decrypt() + * fname_decrypt() * This function decrypts the input filename, and returns * the length of the plaintext. * Errors are returned as negative numbers. * We trust the caller to allocate sufficient memory to oname string. */ -static int f2fs_fname_decrypt(struct inode *inode, - const struct f2fs_str *iname, struct f2fs_str *oname) +static int fname_decrypt(struct inode *inode, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist src_sg, dst_sg; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; + struct fscrypt_info *ci = inode->i_crypt_info; + struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; - char iv[F2FS_CRYPTO_BLOCK_SIZE]; - unsigned lim = max_name_len(inode); + char iv[FS_CRYPTO_BLOCK_SIZE]; + unsigned lim; + lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; /* Allocate request */ - req = ablkcipher_request_alloc(tfm, GFP_NOFS); + req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR "%s: crypto_request_alloc() failed\n", __func__); return -ENOMEM; } - ablkcipher_request_set_callback(req, + skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_dir_crypt_complete, &ecr); + dir_crypt_complete, &ecr); /* Initialize IV */ - memset(iv, 0, F2FS_CRYPTO_BLOCK_SIZE); + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); /* Create decryption request */ sg_init_one(&src_sg, iname->name, iname->len); sg_init_one(&dst_sg, oname->name, oname->len); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); - res = crypto_ablkcipher_decrypt(req); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, iname->len, iv); + res = crypto_skcipher_decrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); wait_for_completion(&ecr.completion); res = ecr.res; } - ablkcipher_request_free(req); + skcipher_request_free(req); if (res < 0) { printk_ratelimited(KERN_ERR - "%s: Error in f2fs_fname_decrypt (error code %d)\n", - __func__, res); + "%s: Error (error code %d)\n", __func__, res); return res; } @@ -200,7 +175,7 @@ static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; /** - * f2fs_fname_encode_digest() - + * digest_encode() - * * Encodes the input digest using characters from the set [a-zA-Z0-9_+]. * The encoded string is roughly 4/3 times the size of the input string. @@ -249,148 +224,152 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -/** - * f2fs_fname_crypto_round_up() - - * - * Return: The next multiple of block size - */ -u32 f2fs_fname_crypto_round_up(u32 size, u32 blksize) +u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) { - return ((size + blksize - 1) / blksize) * blksize; + int padding = 32; + struct fscrypt_info *ci = inode->i_crypt_info; + + if (ci) + padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); + if (ilen < FS_CRYPTO_BLOCK_SIZE) + ilen = FS_CRYPTO_BLOCK_SIZE; + return size_round_up(ilen, padding); } +EXPORT_SYMBOL(fscrypt_fname_encrypted_size); /** - * f2fs_fname_crypto_alloc_obuff() - + * fscrypt_fname_crypto_alloc_obuff() - * * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int f2fs_fname_crypto_alloc_buffer(struct inode *inode, - u32 ilen, struct f2fs_str *crypto_str) +int fscrypt_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen; - int padding = 16; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; + unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); - if (ci) - padding = 4 << (ci->ci_flags & F2FS_POLICY_FLAGS_PAD_MASK); - if (padding < F2FS_CRYPTO_BLOCK_SIZE) - padding = F2FS_CRYPTO_BLOCK_SIZE; - olen = f2fs_fname_crypto_round_up(ilen, padding); crypto_str->len = olen; - if (olen < F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = F2FS_FNAME_CRYPTO_DIGEST_SIZE * 2; - /* Allocated buffer can hold one more character to null-terminate the - * string */ + if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) + olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + /* + * Allocated buffer can hold one more character to null-terminate the + * string + */ crypto_str->name = kmalloc(olen + 1, GFP_NOFS); if (!(crypto_str->name)) return -ENOMEM; return 0; } +EXPORT_SYMBOL(fscrypt_fname_alloc_buffer); /** - * f2fs_fname_crypto_free_buffer() - + * fscrypt_fname_crypto_free_buffer() - * * Frees the buffer allocated for crypto operation. */ -void f2fs_fname_crypto_free_buffer(struct f2fs_str *crypto_str) +void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) { if (!crypto_str) return; kfree(crypto_str->name); crypto_str->name = NULL; } +EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** - * f2fs_fname_disk_to_usr() - converts a filename from disk space to user space + * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user + * space */ -int f2fs_fname_disk_to_usr(struct inode *inode, - f2fs_hash_t *hash, - const struct f2fs_str *iname, - struct f2fs_str *oname) +int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; int ret; - if (is_dot_dotdot(&qname)) { + if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - if (F2FS_I(inode)->i_crypt_info) - return f2fs_fname_decrypt(inode, iname, oname); + if (iname->len < FS_CRYPTO_BLOCK_SIZE) + return -EUCLEAN; - if (iname->len <= F2FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (inode->i_crypt_info) + return fname_decrypt(inode, iname, oname); + + if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { ret = digest_encode(iname->name, iname->len, oname->name); oname->len = ret; return ret; } if (hash) { - memcpy(buf, hash, 4); - memset(buf + 4, 0, 4); - } else + memcpy(buf, &hash, 4); + memcpy(buf + 4, &minor_hash, 4); + } else { memset(buf, 0, 8); - memcpy(buf + 8, iname->name + ((iname->len - 17) & ~15), 16); + } + memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; ret = digest_encode(buf, 24, oname->name + 1); oname->len = ret + 1; return ret + 1; } +EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** - * f2fs_fname_usr_to_disk() - converts a filename from user space to disk space + * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk + * space */ -int f2fs_fname_usr_to_disk(struct inode *inode, +int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, - struct f2fs_str *oname) + struct fscrypt_str *oname) { - int res; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (is_dot_dotdot(iname)) { + if (fscrypt_is_dot_dotdot(iname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; return oname->len; } - - if (ci) { - res = f2fs_fname_encrypt(inode, iname, oname); - return res; - } - /* Without a proper key, a user is not allowed to modify the filenames + if (inode->i_crypt_info) + return fname_encrypt(inode, iname, oname); + /* + * Without a proper key, a user is not allowed to modify the filenames * in a directory. Consequently, a user space name cannot be mapped to - * a disk-space name */ + * a disk-space name + */ return -EACCES; } +EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); -int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, - int lookup, struct f2fs_filename *fname) +int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, + int lookup, struct fscrypt_name *fname) { - struct f2fs_crypt_info *ci; int ret = 0, bigname = 0; - memset(fname, 0, sizeof(struct f2fs_filename)); + memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; - if (!f2fs_encrypted_inode(dir) || is_dot_dotdot(iname)) { + if (!dir->i_sb->s_cop->is_encrypted(dir) || + fscrypt_is_dot_dotdot(iname)) { fname->disk_name.name = (unsigned char *)iname->name; fname->disk_name.len = iname->len; return 0; } - ret = f2fs_get_encryption_info(dir); - if (ret) + ret = get_crypt_info(dir); + if (ret && ret != -EOPNOTSUPP) return ret; - ci = F2FS_I(dir)->i_crypt_info; - if (ci) { - ret = f2fs_fname_crypto_alloc_buffer(dir, iname->len, - &fname->crypto_buf); + + if (dir->i_crypt_info) { + ret = fscrypt_fname_alloc_buffer(dir, iname->len, + &fname->crypto_buf); if (ret < 0) return ret; - ret = f2fs_fname_encrypt(dir, iname, &fname->crypto_buf); + ret = fname_encrypt(dir, iname, &fname->crypto_buf); if (ret < 0) goto errout; fname->disk_name.name = fname->crypto_buf.name; @@ -400,18 +379,19 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, if (!lookup) return -EACCES; - /* We don't have the key and we are doing a lookup; decode the + /* + * We don't have the key and we are doing a lookup; decode the * user-supplied name */ if (iname->name[0] == '_') bigname = 1; - if ((bigname && (iname->len != 33)) || - (!bigname && (iname->len > 43))) + if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) return -ENOENT; fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; + ret = digest_decode(iname->name + bigname, iname->len - bigname, fname->crypto_buf.name); if (ret < 0) { @@ -421,20 +401,24 @@ int f2fs_fname_setup_filename(struct inode *dir, const struct qstr *iname, fname->crypto_buf.len = ret; if (bigname) { memcpy(&fname->hash, fname->crypto_buf.name, 4); + memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; } return 0; + errout: - f2fs_fname_crypto_free_buffer(&fname->crypto_buf); + fscrypt_fname_free_buffer(&fname->crypto_buf); return ret; } +EXPORT_SYMBOL(fscrypt_setup_filename); -void f2fs_fname_free_filename(struct f2fs_filename *fname) +void fscrypt_free_filename(struct fscrypt_name *fname) { kfree(fname->crypto_buf.name); fname->crypto_buf.name = NULL; fname->usr_fname = NULL; fname->disk_name.name = NULL; } +EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c new file mode 100644 index 000000000000..1ac263eddc4e --- /dev/null +++ b/fs/crypto/keyinfo.c @@ -0,0 +1,304 @@ +/* + * key management facility for FS encryption support. + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#include +#include +#include +#include +#include +#include + +static void derive_crypt_complete(struct crypto_async_request *req, int rc) +{ + struct fscrypt_completion_result *ecr = req->data; + + if (rc == -EINPROGRESS) + return; + + ecr->res = rc; + complete(&ecr->completion); +} + +/** + * derive_key_aes() - Derive a key using AES-128-ECB + * @deriving_key: Encryption key used for derivation. + * @source_key: Source key to which to apply derivation. + * @derived_key: Derived key. + * + * Return: Zero on success; non-zero otherwise. + */ +static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], + u8 source_key[FS_AES_256_XTS_KEY_SIZE], + u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) +{ + int res = 0; + struct skcipher_request *req = NULL; + DECLARE_FS_COMPLETION_RESULT(ecr); + struct scatterlist src_sg, dst_sg; + struct crypto_skcipher *tfm = crypto_alloc_skcipher("ecb(aes)", 0, 0); + + if (IS_ERR(tfm)) { + res = PTR_ERR(tfm); + tfm = NULL; + goto out; + } + crypto_skcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); + req = skcipher_request_alloc(tfm, GFP_NOFS); + if (!req) { + res = -ENOMEM; + goto out; + } + skcipher_request_set_callback(req, + CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, + derive_crypt_complete, &ecr); + res = crypto_skcipher_setkey(tfm, deriving_key, + FS_AES_128_ECB_KEY_SIZE); + if (res < 0) + goto out; + + sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); + sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, + FS_AES_256_XTS_KEY_SIZE, NULL); + res = crypto_skcipher_encrypt(req); + if (res == -EINPROGRESS || res == -EBUSY) { + wait_for_completion(&ecr.completion); + res = ecr.res; + } +out: + skcipher_request_free(req); + crypto_free_skcipher(tfm); + return res; +} + +static int validate_user_key(struct fscrypt_info *crypt_info, + struct fscrypt_context *ctx, u8 *raw_key, + u8 *prefix, int prefix_size) +{ + u8 *full_key_descriptor; + struct key *keyring_key; + struct fscrypt_key *master_key; + const struct user_key_payload *ukp; + int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; + int res; + + full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); + if (!full_key_descriptor) + return -ENOMEM; + + memcpy(full_key_descriptor, prefix, prefix_size); + sprintf(full_key_descriptor + prefix_size, + "%*phN", FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + full_key_descriptor[full_key_len - 1] = '\0'; + keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); + kfree(full_key_descriptor); + if (IS_ERR(keyring_key)) + return PTR_ERR(keyring_key); + + if (keyring_key->type != &key_type_logon) { + printk_once(KERN_WARNING + "%s: key type must be logon\n", __func__); + res = -ENOKEY; + goto out; + } + down_read(&keyring_key->sem); + ukp = user_key_payload(keyring_key); + if (ukp->datalen != sizeof(struct fscrypt_key)) { + res = -EINVAL; + up_read(&keyring_key->sem); + goto out; + } + master_key = (struct fscrypt_key *)ukp->data; + BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); + + if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + printk_once(KERN_WARNING + "%s: key size incorrect: %d\n", + __func__, master_key->size); + res = -ENOKEY; + up_read(&keyring_key->sem); + goto out; + } + res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); + if (res) + goto out; + + crypt_info->ci_keyring_key = keyring_key; + return 0; +out: + key_put(keyring_key); + return res; +} + +static void put_crypt_info(struct fscrypt_info *ci) +{ + if (!ci) + return; + + key_put(ci->ci_keyring_key); + crypto_free_skcipher(ci->ci_ctfm); + kmem_cache_free(fscrypt_info_cachep, ci); +} + +int get_crypt_info(struct inode *inode) +{ + struct fscrypt_info *crypt_info; + struct fscrypt_context ctx; + struct crypto_skcipher *ctfm; + const char *cipher_str; + u8 raw_key[FS_MAX_KEY_SIZE]; + u8 mode; + int res; + + res = fscrypt_initialize(); + if (res) + return res; + + if (!inode->i_sb->s_cop->get_context) + return -EOPNOTSUPP; +retry: + crypt_info = ACCESS_ONCE(inode->i_crypt_info); + if (crypt_info) { + if (!crypt_info->ci_keyring_key || + key_validate(crypt_info->ci_keyring_key) == 0) + return 0; + fscrypt_put_encryption_info(inode, crypt_info); + goto retry; + } + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0) { + if (!fscrypt_dummy_context_enabled(inode)) + return res; + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + } else if (res != sizeof(ctx)) { + return -EINVAL; + } + res = 0; + + crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); + if (!crypt_info) + return -ENOMEM; + + crypt_info->ci_flags = ctx.flags; + crypt_info->ci_data_mode = ctx.contents_encryption_mode; + crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; + crypt_info->ci_ctfm = NULL; + crypt_info->ci_keyring_key = NULL; + memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, + sizeof(crypt_info->ci_master_key)); + if (S_ISREG(inode->i_mode)) + mode = crypt_info->ci_data_mode; + else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) + mode = crypt_info->ci_filename_mode; + else + BUG(); + + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + cipher_str = "xts(aes)"; + break; + case FS_ENCRYPTION_MODE_AES_256_CTS: + cipher_str = "cts(cbc(aes))"; + break; + default: + printk_once(KERN_WARNING + "%s: unsupported key mode %d (ino %u)\n", + __func__, mode, (unsigned) inode->i_ino); + res = -ENOKEY; + goto out; + } + if (fscrypt_dummy_context_enabled(inode)) { + memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); + goto got_key; + } + + res = validate_user_key(crypt_info, &ctx, raw_key, + FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + if (res && inode->i_sb->s_cop->key_prefix) { + u8 *prefix = NULL; + int prefix_size, res2; + + prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); + res2 = validate_user_key(crypt_info, &ctx, raw_key, + prefix, prefix_size); + if (res2) { + if (res2 == -ENOKEY) + res = -ENOKEY; + goto out; + } + } else if (res) { + goto out; + } +got_key: + ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); + if (!ctfm || IS_ERR(ctfm)) { + res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; + printk(KERN_DEBUG + "%s: error %d (inode %u) allocating crypto tfm\n", + __func__, res, (unsigned) inode->i_ino); + goto out; + } + crypt_info->ci_ctfm = ctfm; + crypto_skcipher_clear_flags(ctfm, ~0); + crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + if (res) + goto out; + + memzero_explicit(raw_key, sizeof(raw_key)); + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { + put_crypt_info(crypt_info); + goto retry; + } + return 0; + +out: + if (res == -ENOKEY) + res = 0; + put_crypt_info(crypt_info); + memzero_explicit(raw_key, sizeof(raw_key)); + return res; +} + +void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) +{ + struct fscrypt_info *prev; + + if (ci == NULL) + ci = ACCESS_ONCE(inode->i_crypt_info); + if (ci == NULL) + return; + + prev = cmpxchg(&inode->i_crypt_info, ci, NULL); + if (prev != ci) + return; + + put_crypt_info(ci); +} +EXPORT_SYMBOL(fscrypt_put_encryption_info); + +int fscrypt_get_encryption_info(struct inode *inode) +{ + struct fscrypt_info *ci = inode->i_crypt_info; + + if (!ci || + (ci->ci_keyring_key && + (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | + (1 << KEY_FLAG_REVOKED) | + (1 << KEY_FLAG_DEAD))))) + return get_crypt_info(inode); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c new file mode 100644 index 000000000000..ed115acb5dee --- /dev/null +++ b/fs/crypto/policy.c @@ -0,0 +1,246 @@ +/* + * Encryption policy functions for per-file encryption support. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#include +#include +#include +#include + +static int inode_has_encryption_context(struct inode *inode) +{ + if (!inode->i_sb->s_cop->get_context) + return 0; + return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); +} + +/* + * check whether the policy is consistent with the encryption context + * for the inode + */ +static int is_encryption_context_consistent_with_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context) + return 0; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return 0; + + return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx.flags == policy->flags) && + (ctx.contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx.filenames_encryption_mode == + policy->filenames_encryption_mode)); +} + +static int create_encryption_context_from_policy(struct inode *inode, + const struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + if (inode->i_sb->s_cop->prepare_context) { + res = inode->i_sb->s_cop->prepare_context(inode); + if (res) + return res; + } + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + + if (!fscrypt_valid_contents_enc_mode( + policy->contents_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid contents encryption mode %d\n", __func__, + policy->contents_encryption_mode); + return -EINVAL; + } + + if (!fscrypt_valid_filenames_enc_mode( + policy->filenames_encryption_mode)) { + printk(KERN_WARNING + "%s: Invalid filenames encryption mode %d\n", __func__, + policy->filenames_encryption_mode); + return -EINVAL; + } + + if (policy->flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; + + ctx.contents_encryption_mode = policy->contents_encryption_mode; + ctx.filenames_encryption_mode = policy->filenames_encryption_mode; + ctx.flags = policy->flags; + BUILD_BUG_ON(sizeof(ctx.nonce) != FS_KEY_DERIVATION_NONCE_SIZE); + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + + return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); +} + +int fscrypt_process_policy(struct file *filp, + const struct fscrypt_policy *policy) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (policy->version != 0) + return -EINVAL; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (!inode_has_encryption_context(inode)) { + if (!S_ISDIR(inode->i_mode)) + ret = -EINVAL; + else if (!inode->i_sb->s_cop->empty_dir) + ret = -EOPNOTSUPP; + else if (!inode->i_sb->s_cop->empty_dir(inode)) + ret = -ENOTEMPTY; + else + ret = create_encryption_context_from_policy(inode, + policy); + } else if (!is_encryption_context_consistent_with_policy(inode, + policy)) { + printk(KERN_WARNING + "%s: Policy inconsistent with encryption context\n", + __func__); + ret = -EINVAL; + } + + mnt_drop_write_file(filp); + return ret; +} +EXPORT_SYMBOL(fscrypt_process_policy); + +int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +{ + struct fscrypt_context ctx; + int res; + + if (!inode->i_sb->s_cop->get_context || + !inode->i_sb->s_cop->is_encrypted(inode)) + return -ENODATA; + + res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res != sizeof(ctx)) + return -ENODATA; + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + policy->version = 0; + policy->contents_encryption_mode = ctx.contents_encryption_mode; + policy->filenames_encryption_mode = ctx.filenames_encryption_mode; + policy->flags = ctx.flags; + memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE); + return 0; +} +EXPORT_SYMBOL(fscrypt_get_policy); + +int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) +{ + struct fscrypt_info *parent_ci, *child_ci; + int res; + + if ((parent == NULL) || (child == NULL)) { + printk(KERN_ERR "parent %p child %p\n", parent, child); + BUG_ON(1); + } + + /* no restrictions if the parent directory is not encrypted */ + if (!parent->i_sb->s_cop->is_encrypted(parent)) + return 1; + /* if the child directory is not encrypted, this is always a problem */ + if (!parent->i_sb->s_cop->is_encrypted(child)) + return 0; + res = fscrypt_get_encryption_info(parent); + if (res) + return 0; + res = fscrypt_get_encryption_info(child); + if (res) + return 0; + parent_ci = parent->i_crypt_info; + child_ci = child->i_crypt_info; + if (!parent_ci && !child_ci) + return 1; + if (!parent_ci || !child_ci) + return 0; + + return (memcmp(parent_ci->ci_master_key, + child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags)); +} +EXPORT_SYMBOL(fscrypt_has_permitted_context); + +/** + * fscrypt_inherit_context() - Sets a child context from its parent + * @parent: Parent inode from which the context is inherited. + * @child: Child inode that inherits the context from @parent. + * @fs_data: private data given by FS. + * @preload: preload child i_crypt_info + * + * Return: Zero on success, non-zero otherwise + */ +int fscrypt_inherit_context(struct inode *parent, struct inode *child, + void *fs_data, bool preload) +{ + struct fscrypt_context ctx; + struct fscrypt_info *ci; + int res; + + if (!parent->i_sb->s_cop->set_context) + return -EOPNOTSUPP; + + res = fscrypt_get_encryption_info(parent); + if (res < 0) + return res; + + ci = parent->i_crypt_info; + if (ci == NULL) + return -ENOKEY; + + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; + if (fscrypt_dummy_context_enabled(parent)) { + ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; + ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; + ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); + res = 0; + } else { + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); + } + get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); + res = parent->i_sb->s_cop->set_context(child, &ctx, + sizeof(ctx), fs_data); + if (res) + return res; + return preload ? fscrypt_get_encryption_info(child): 0; +} +EXPORT_SYMBOL(fscrypt_inherit_context); diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index b0a9dc929f88..1852d99df97b 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -1,6 +1,9 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK + select CRYPTO + select KEYS + select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -76,15 +79,7 @@ config F2FS_FS_ENCRYPTION bool "F2FS Encryption" depends on F2FS_FS depends on F2FS_FS_XATTR - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_ECB - select CRYPTO_XTS - select CRYPTO_CTS - select CRYPTO_CTR - select CRYPTO_SHA256 - select KEYS - select ENCRYPTED_KEYS + select FS_ENCRYPTION help Enable encryption of f2fs files and directories. This feature is similar to ecryptfs, but it is more memory @@ -100,3 +95,11 @@ config F2FS_IO_TRACE information and block IO patterns in the filesystem level. If unsure, say N. + +config F2FS_FAULT_INJECTION + bool "F2FS fault injection facility" + depends on F2FS_FS + help + Test F2FS to inject faults such as ENOMEM, ENOSPC, and so on. + + If unsure, say N. diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index 08e101ed914c..ca949ea7c02f 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -7,5 +7,3 @@ f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o f2fs-$(CONFIG_F2FS_IO_TRACE) += trace.o -f2fs-$(CONFIG_F2FS_FS_ENCRYPTION) += crypto_policy.o crypto.o \ - crypto_key.o crypto_fname.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 83dcf7bfd7b8..fb0744b94c2f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -109,14 +109,16 @@ static struct posix_acl *f2fs_acl_from_disk(const char *value, size_t size) return ERR_PTR(-EINVAL); } -static void *f2fs_acl_to_disk(const struct posix_acl *acl, size_t *size) +static void *f2fs_acl_to_disk(struct f2fs_sb_info *sbi, + const struct posix_acl *acl, size_t *size) { struct f2fs_acl_header *f2fs_acl; struct f2fs_acl_entry *entry; int i; - f2fs_acl = kmalloc(sizeof(struct f2fs_acl_header) + acl->a_count * - sizeof(struct f2fs_acl_entry), GFP_NOFS); + f2fs_acl = f2fs_kmalloc(sbi, sizeof(struct f2fs_acl_header) + + acl->a_count * sizeof(struct f2fs_acl_entry), + GFP_NOFS); if (!f2fs_acl) return ERR_PTR(-ENOMEM); @@ -175,7 +177,7 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); if (retval > 0) { - value = kmalloc(retval, GFP_F2FS_ZERO); + value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, @@ -204,7 +206,6 @@ struct posix_acl *f2fs_get_acl(struct inode *inode, int type) static int __f2fs_set_acl(struct inode *inode, int type, struct posix_acl *acl, struct page *ipage) { - struct f2fs_inode_info *fi = F2FS_I(inode); int name_index; void *value = NULL; size_t size = 0; @@ -213,11 +214,13 @@ static int __f2fs_set_acl(struct inode *inode, int type, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl && !ipage) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); - if (error) + if (acl) { + error = posix_acl_equiv_mode(acl, &inode->i_mode); + if (error < 0) return error; - set_acl_inode(fi, inode->i_mode); + set_acl_inode(inode, inode->i_mode); + if (error == 0) + acl = NULL; } break; @@ -232,9 +235,9 @@ static int __f2fs_set_acl(struct inode *inode, int type, } if (acl) { - value = f2fs_acl_to_disk(acl, &size); + value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return (int)PTR_ERR(value); } } @@ -245,7 +248,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, if (!error) set_cached_acl(inode, type, acl); - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); return error; } @@ -386,6 +389,8 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; + f2fs_mark_inode_dirty_sync(inode); + if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, ipage); diff --git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 997ca8edb6cb..2c685185c24d 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -37,11 +37,10 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL extern struct posix_acl *f2fs_get_acl(struct inode *, int); -extern int f2fs_set_acl(struct inode *inode, struct posix_acl *acl, int type); +extern int f2fs_set_acl(struct inode *, struct posix_acl *, int); extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, struct page *); #else -#define f2fs_check_acl NULL #define f2fs_get_acl NULL #define f2fs_set_acl NULL diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index f661d80474be..cb23d6cf676b 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -26,6 +26,14 @@ static struct kmem_cache *ino_entry_slab; struct kmem_cache *inode_entry_slab; +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) +{ + set_ckpt_flags(sbi, CP_ERROR_FLAG); + sbi->sb->s_flags |= MS_RDONLY; + if (!end_io) + f2fs_flush_merged_bios(sbi); +} + /* * We guarantee no failure on the returned page. */ @@ -34,13 +42,14 @@ struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) struct address_space *mapping = META_MAPPING(sbi); struct page *page = NULL; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META); - SetPageUptodate(page); + f2fs_wait_on_page_writeback(page, META, true); + if (!PageUptodate(page)) + SetPageUptodate(page); return page; } @@ -56,14 +65,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, .sbi = sbi, .type = META, .rw = READ_SYNC | REQ_META | REQ_PRIO, - .blk_addr = index, + .old_blkaddr = index, + .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) fio.rw &= ~REQ_META; repeat: - page = grab_cache_page(mapping, index); + page = f2fs_grab_cache_page(mapping, index, false); if (!page) { cond_resched(); goto repeat; @@ -90,7 +100,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, * meta page. */ if (unlikely(!PageUptodate(page))) - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); out: return page; } @@ -143,7 +153,6 @@ bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type) int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { - block_t prev_blk_addr = 0; struct page *page; block_t blkno = start; struct f2fs_io_info fio = { @@ -152,10 +161,12 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, .encrypted_page = NULL, }; + struct blk_plug plug; if (unlikely(type == META_POR)) fio.rw &= ~REQ_META; + blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { if (!is_valid_blkaddr(sbi, blkno, type)) @@ -167,27 +178,25 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, NAT_BLOCK_OFFSET(NM_I(sbi)->max_nid))) blkno = 0; /* get nat block addr */ - fio.blk_addr = current_nat_addr(sbi, + fio.new_blkaddr = current_nat_addr(sbi, blkno * NAT_ENTRY_PER_BLOCK); break; case META_SIT: /* get sit block addr */ - fio.blk_addr = current_sit_addr(sbi, + fio.new_blkaddr = current_sit_addr(sbi, blkno * SIT_ENTRY_PER_BLOCK); - if (blkno != start && prev_blk_addr + 1 != fio.blk_addr) - goto out; - prev_blk_addr = fio.blk_addr; break; case META_SSA: case META_CP: case META_POR: - fio.blk_addr = blkno; + fio.new_blkaddr = blkno; break; default: BUG(); } - page = grab_cache_page(META_MAPPING(sbi), fio.blk_addr); + page = f2fs_grab_cache_page(META_MAPPING(sbi), + fio.new_blkaddr, false); if (!page) continue; if (PageUptodate(page)) { @@ -196,11 +205,13 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; + fio.old_blkaddr = fio.new_blkaddr; f2fs_submit_page_mbio(&fio); f2fs_put_page(page, 0); } out: f2fs_submit_merged_bio(sbi, META, READ); + blk_finish_plug(&plug); return blkno - start; } @@ -210,7 +221,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) bool readahead = false; page = find_get_page(META_MAPPING(sbi), index); - if (!page || (page && !PageUptodate(page))) + if (!page || !PageUptodate(page)) readahead = true; f2fs_put_page(page, 0); @@ -232,13 +243,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, META); write_meta_page(sbi, page); dec_page_count(sbi, F2FS_DIRTY_META); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, META, WRITE); + return 0; redirty_out: @@ -252,13 +267,13 @@ static int f2fs_write_meta_pages(struct address_space *mapping, struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; - trace_f2fs_writepages(mapping->host, wbc, META); - /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, META); + /* if mounting is failed, skip writing node pages */ mutex_lock(&sbi->cp_mutex); diff = nr_pages_to_write(sbi, META, wbc); @@ -269,6 +284,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_META); + trace_f2fs_writepages(mapping->host, wbc, META); return 0; } @@ -276,15 +292,18 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, long nr_to_write) { struct address_space *mapping = META_MAPPING(sbi); - pgoff_t index = 0, end = LONG_MAX, prev = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; struct pagevec pvec; long nwritten = 0; struct writeback_control wbc = { .for_reclaim = 0, }; + struct blk_plug plug; pagevec_init(&pvec, 0); + blk_start_plug(&plug); + while (index <= end) { int i, nr_pages; nr_pages = pagevec_lookup_tag(&pvec, mapping, &index, @@ -296,7 +315,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; - if (prev == LONG_MAX) + if (prev == ULONG_MAX) prev = page->index - 1; if (nr_to_write != LONG_MAX && page->index != prev + 1) { pagevec_release(&pvec); @@ -315,6 +334,9 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, goto continue_unlock; } + f2fs_wait_on_page_writeback(page, META, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; @@ -334,6 +356,8 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, if (nwritten) f2fs_submit_merged_bio(sbi, type, WRITE); + blk_finish_plug(&plug); + return nwritten; } @@ -341,9 +365,10 @@ static int f2fs_set_meta_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, META); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_META); SetPagePrivate(page); f2fs_trace_pid(page); @@ -358,6 +383,9 @@ const struct address_space_operations f2fs_meta_aops = { .set_page_dirty = f2fs_set_meta_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) @@ -410,13 +438,13 @@ static void __remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) spin_unlock(&im->ino_lock); } -void add_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* add new dirty ino entry into list */ __add_ino_entry(sbi, ino, type); } -void remove_dirty_inode(struct f2fs_sb_info *sbi, nid_t ino, int type) +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type) { /* remove dirty ino entry from list */ __remove_ino_entry(sbi, ino, type); @@ -434,12 +462,12 @@ bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode) return e ? true : false; } -void release_dirty_inode(struct f2fs_sb_info *sbi) +void release_ino_entry(struct f2fs_sb_info *sbi, bool all) { struct ino_entry *e, *tmp; int i; - for (i = APPEND_INO; i <= UPDATE_INO; i++) { + for (i = all ? ORPHAN_INO: APPEND_INO; i <= UPDATE_INO; i++) { struct inode_management *im = &sbi->im[i]; spin_lock(&im->ino_lock); @@ -459,6 +487,13 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) int err = 0; spin_lock(&im->ino_lock); + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ORPHAN)) { + spin_unlock(&im->ino_lock); + return -ENOSPC; + } +#endif if (unlikely(im->ino_num >= sbi->max_orphans)) err = -ENOSPC; else @@ -478,10 +513,11 @@ void release_orphan_inode(struct f2fs_sb_info *sbi) spin_unlock(&im->ino_lock); } -void add_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) +void add_orphan_inode(struct inode *inode) { /* add new orphan ino entry into list */ - __add_ino_entry(sbi, ino, ORPHAN_INO); + __add_ino_entry(F2FS_I_SB(inode), inode->i_ino, ORPHAN_INO); + update_inode_page(inode); } void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) @@ -493,8 +529,20 @@ void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; + struct node_info ni; + int err = acquire_orphan_inode(sbi); - inode = f2fs_iget(sbi->sb, ino); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return err; + } + + __add_ino_entry(sbi, ino, ORPHAN_INO); + + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) { /* * there should be a bug that we can't find the entry @@ -508,6 +556,18 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) /* truncate all the data during iput */ iput(inode); + + get_node_info(sbi, ino, &ni); + + /* ENOMEM was fully retried in f2fs_evict_inode. */ + if (ni.blk_addr != NULL_ADDR) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: orphan failed (ino=%x), run fsck to fix.", + __func__, ino); + return -EIO; + } + __remove_ino_entry(sbi, ino, ORPHAN_INO); return 0; } @@ -516,7 +576,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) block_t start_blk, orphan_blocks, i, j; int err; - if (!is_set_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG)) + if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); @@ -540,7 +600,7 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_put_page(page, 1); } /* clear Orphan Flag */ - clear_ckpt_flags(F2FS_CKPT(sbi), CP_ORPHAN_PRESENT_FLAG); + clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); return 0; } @@ -601,45 +661,55 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) } } +static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, + struct f2fs_checkpoint **cp_block, struct page **cp_page, + unsigned long long *version) +{ + unsigned long blk_size = sbi->blocksize; + size_t crc_offset = 0; + __u32 crc = 0; + + *cp_page = get_meta_page(sbi, cp_addr); + *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + + crc_offset = le32_to_cpu((*cp_block)->checksum_offset); + if (crc_offset >= blk_size) { + f2fs_msg(sbi->sb, KERN_WARNING, + "invalid crc_offset: %zu", crc_offset); + return -EINVAL; + } + + crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block + + crc_offset))); + if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { + f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); + return -EINVAL; + } + + *version = cur_cp_version(*cp_block); + return 0; +} + static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, block_t cp_addr, unsigned long long *version) { - struct page *cp_page_1, *cp_page_2 = NULL; - unsigned long blk_size = sbi->blocksize; - struct f2fs_checkpoint *cp_block; + struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; - size_t crc_offset; - __u32 crc = 0; + int err; - /* Read the 1st cp block in this CP pack */ - cp_page_1 = get_meta_page(sbi, cp_addr); - - /* get the version number */ - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_1); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_1, version); + if (err) goto invalid_cp1; + pre_version = *version; - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp1; - - pre_version = cur_cp_version(cp_block); - - /* Read the 2nd cp block in this CP pack */ cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1; - cp_page_2 = get_meta_page(sbi, cp_addr); - - cp_block = (struct f2fs_checkpoint *)page_address(cp_page_2); - crc_offset = le32_to_cpu(cp_block->checksum_offset); - if (crc_offset >= blk_size) + err = get_checkpoint_version(sbi, cp_addr, &cp_block, + &cp_page_2, version); + if (err) goto invalid_cp2; - - crc = le32_to_cpu(*((__le32 *)((unsigned char *)cp_block + crc_offset))); - if (!f2fs_crc_valid(crc, cp_block, crc_offset)) - goto invalid_cp2; - - cur_version = cur_cp_version(cp_block); + cur_version = *version; if (cur_version == pre_version) { *version = cur_version; @@ -696,6 +766,10 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) cp_block = (struct f2fs_checkpoint *)page_address(cur_page); memcpy(sbi->ckpt, cp_block, blk_size); + /* Sanity checking of checkpoint */ + if (sanity_check_ckpt(sbi)) + goto fail_no_cp; + if (cp_blks <= 1) goto done; @@ -722,118 +796,94 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) return -EINVAL; } -static int __add_dirty_inode(struct inode *inode, struct inode_entry *new) +static void __add_dirty_inode(struct inode *inode, enum inode_type type) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; - if (is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) - return -EEXIST; + if (is_inode_flag_set(inode, flag)) + return; - set_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - F2FS_I(inode)->dirty_dir = new; - list_add_tail(&new->list, &sbi->dir_inode_list); - stat_inc_dirty_dir(sbi); - return 0; + set_inode_flag(inode, flag); + list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + stat_inc_dirty_inode(sbi, type); +} + +static void __remove_dirty_inode(struct inode *inode, enum inode_type type) +{ + int flag = (type == DIR_INODE) ? FI_DIRTY_DIR : FI_DIRTY_FILE; + + if (get_dirty_pages(inode) || !is_inode_flag_set(inode, flag)) + return; + + list_del_init(&F2FS_I(inode)->dirty_list); + clear_inode_flag(inode, flag); + stat_dec_dirty_inode(F2FS_I_SB(inode), type); } void update_dirty_page(struct inode *inode, struct page *page) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new; - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return; - if (!S_ISDIR(inode->i_mode)) { - inode_inc_dirty_pages(inode); - goto out; - } - - new = f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); + spin_lock(&sbi->inode_lock[type]); + if (type != FILE_INODE || test_opt(sbi, DATA_FLUSH)) + __add_dirty_inode(inode, type); inode_inc_dirty_pages(inode); - spin_unlock(&sbi->dir_inode_lock); + spin_unlock(&sbi->inode_lock[type]); - if (ret) - kmem_cache_free(inode_entry_slab, new); -out: SetPagePrivate(page); f2fs_trace_pid(page); } -void add_dirty_dir_inode(struct inode *inode) +void remove_dirty_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *new = - f2fs_kmem_cache_alloc(inode_entry_slab, GFP_NOFS); - int ret = 0; + enum inode_type type = S_ISDIR(inode->i_mode) ? DIR_INODE : FILE_INODE; - new->inode = inode; - INIT_LIST_HEAD(&new->list); - - spin_lock(&sbi->dir_inode_lock); - ret = __add_dirty_inode(inode, new); - spin_unlock(&sbi->dir_inode_lock); - - if (ret) - kmem_cache_free(inode_entry_slab, new); -} - -void remove_dirty_dir_inode(struct inode *inode) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct inode_entry *entry; - - if (!S_ISDIR(inode->i_mode)) + if (!S_ISDIR(inode->i_mode) && !S_ISREG(inode->i_mode) && + !S_ISLNK(inode->i_mode)) return; - spin_lock(&sbi->dir_inode_lock); - if (get_dirty_pages(inode) || - !is_inode_flag_set(F2FS_I(inode), FI_DIRTY_DIR)) { - spin_unlock(&sbi->dir_inode_lock); + if (type == FILE_INODE && !test_opt(sbi, DATA_FLUSH)) return; - } - entry = F2FS_I(inode)->dirty_dir; - list_del(&entry->list); - F2FS_I(inode)->dirty_dir = NULL; - clear_inode_flag(F2FS_I(inode), FI_DIRTY_DIR); - stat_dec_dirty_dir(sbi); - spin_unlock(&sbi->dir_inode_lock); - kmem_cache_free(inode_entry_slab, entry); - - /* Only from the recovery routine */ - if (is_inode_flag_set(F2FS_I(inode), FI_DELAY_IPUT)) { - clear_inode_flag(F2FS_I(inode), FI_DELAY_IPUT); - iput(inode); - } + spin_lock(&sbi->inode_lock[type]); + __remove_dirty_inode(inode, type); + spin_unlock(&sbi->inode_lock[type]); } -void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) { struct list_head *head; - struct inode_entry *entry; struct inode *inode; + struct f2fs_inode_info *fi; + bool is_dir = (type == DIR_INODE); + + trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); retry: if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - spin_lock(&sbi->dir_inode_lock); + spin_lock(&sbi->inode_lock[type]); - head = &sbi->dir_inode_list; + head = &sbi->inode_list[type]; if (list_empty(head)) { - spin_unlock(&sbi->dir_inode_lock); - return; + spin_unlock(&sbi->inode_lock[type]); + trace_f2fs_sync_dirty_inodes_exit(sbi->sb, is_dir, + get_pages(sbi, is_dir ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); + return 0; } - entry = list_entry(head->next, struct inode_entry, list); - inode = igrab(entry->inode); - spin_unlock(&sbi->dir_inode_lock); + fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[type]); if (inode) { filemap_fdatawrite(inode->i_mapping); iput(inode); @@ -848,6 +898,34 @@ void sync_dirty_dir_inodes(struct f2fs_sb_info *sbi) goto retry; } +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) +{ + struct list_head *head = &sbi->inode_list[DIRTY_META]; + struct inode *inode; + struct f2fs_inode_info *fi; + s64 total = get_pages(sbi, F2FS_DIRTY_IMETA); + + while (total--) { + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (list_empty(head)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 0; + } + fi = list_entry(head->next, struct f2fs_inode_info, + gdirty_list); + inode = igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + if (inode) { + update_inode_page(inode); + iput(inode); + } + }; + return 0; +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -868,11 +946,17 @@ static int block_operations(struct f2fs_sb_info *sbi) /* write all the dirty dentry pages */ if (get_pages(sbi, F2FS_DIRTY_DENTS)) { f2fs_unlock_all(sbi); - sync_dirty_dir_inodes(sbi); - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; + err = sync_dirty_inodes(sbi, DIR_INODE); + if (err) + goto out; + goto retry_flush_dents; + } + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) goto out; - } goto retry_flush_dents; } @@ -885,10 +969,9 @@ static int block_operations(struct f2fs_sb_info *sbi) if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - sync_node_pages(sbi, 0, &wbc); - if (unlikely(f2fs_cp_error(sbi))) { + err = sync_node_pages(sbi, &wbc); + if (err) { f2fs_unlock_all(sbi); - err = -EIO; goto out; } goto retry_flush_nodes; @@ -901,6 +984,8 @@ static int block_operations(struct f2fs_sb_info *sbi) static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); + + build_free_nids(sbi); f2fs_unlock_all(sbi); } @@ -911,18 +996,48 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!get_pages(sbi, F2FS_WRITEBACK)) + if (!atomic_read(&sbi->nr_wb_bios)) break; - io_schedule(); + io_schedule_timeout(5*HZ); } finish_wait(&sbi->cp_wait, &wait); } -static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + + spin_lock(&sbi->cp_lock); + + if (cpc->reason == CP_UMOUNT) + __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); + + if (cpc->reason == CP_FASTBOOT) + __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); + + if (orphan_num) + __set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + else + __clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) + __set_ckpt_flags(ckpt, CP_FSCK_FLAG); + + /* set this flag to activate crc|cp_ver for recovery */ + __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); + + spin_unlock(&sbi->cp_lock); +} + +static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; nid_t last_nid = nm_i->next_scan_nid; @@ -931,21 +1046,15 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u32 crc32 = 0; int i; int cp_payload_blks = __cp_payload(sbi); - block_t discard_blk = NEXT_FREE_BLKADDR(sbi, curseg); - bool invalidate = false; - - /* - * This avoids to conduct wrong roll-forward operations and uses - * metapages, so should be called prior to sync_meta_pages below. - */ - if (discard_next_dnode(sbi, discard_blk)) - invalidate = true; + struct super_block *sb = sbi->sb; + struct curseg_info *seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + u64 kbytes_written; /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { sync_meta_pages(sbi, META, LONG_MAX); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; } next_free_nid(sbi, &last_nid); @@ -980,10 +1089,12 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); + spin_lock(&sbi->cp_lock); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) - set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else - clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); + spin_unlock(&sbi->cp_lock); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -998,29 +1109,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) cp_payload_blks + data_sum_blocks + orphan_blocks); - if (cpc->reason == CP_UMOUNT) - set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - else - clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - - if (cpc->reason == CP_FASTBOOT) - set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - else - clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); - - if (orphan_num) - set_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - else - clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG); - - if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) - set_ckpt_flags(ckpt, CP_FSCK_FLAG); + /* update ckpt flag for checkpoint */ + update_ckpt_flags(sbi, cpc); /* update SIT/NAT bitmap */ get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); - crc32 = f2fs_crc32(ckpt, le32_to_cpu(ckpt->checksum_offset)); + crc32 = f2fs_crc32(sbi, ckpt, le32_to_cpu(ckpt->checksum_offset)); *((__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); @@ -1030,7 +1126,7 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; /* write out checkpoint buffer at block 0 */ update_meta_page(sbi, ckpt, start_blk++); @@ -1046,6 +1142,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) write_data_summaries(sbi, start_blk); start_blk += data_sum_blocks; + + /* Record write statistics in the hot node summary */ + kbytes_written = sbi->kbytes_written; + if (sb->s_bdev->bd_part) + kbytes_written += BD_PART_WRITTEN(sbi); + + seg_i->journal->info.kbytes_written = cpu_to_le64(kbytes_written); + if (__remain_node_summaries(cpc->reason)) { write_node_summaries(sbi, start_blk); start_blk += NR_CURSEG_NODE_TYPE; @@ -1058,14 +1162,14 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; - filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LONG_MAX); - filemap_fdatawait_range(META_MAPPING(sbi), 0, LONG_MAX); + filemap_fdatawait_range(NODE_MAPPING(sbi), 0, LLONG_MAX); + filemap_fdatawait_range(META_MAPPING(sbi), 0, LLONG_MAX); /* update user_block_counts */ sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; + percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ sync_meta_pages(sbi, META_FLUSH, LONG_MAX); @@ -1073,30 +1177,36 @@ static void do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); - /* - * invalidate meta page which is used temporarily for zeroing out - * block at the end of warm node chain. - */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), discard_blk, - discard_blk); - - release_dirty_inode(sbi); + release_ino_entry(sbi, false); if (unlikely(f2fs_cp_error(sbi))) - return; + return -EIO; clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); + clear_sbi_flag(sbi, SBI_NEED_CP); + + /* + * redirty superblock if metadata like node page or inode cache is + * updated during writing checkpoint. + */ + if (get_pages(sbi, F2FS_DIRTY_NODES) || + get_pages(sbi, F2FS_DIRTY_IMETA)) + set_sbi_flag(sbi, SBI_IS_DIRTY); + + f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_DENTS)); + + return 0; } /* * We guarantee that this checkpoint procedure will not fail. */ -void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long long ckpt_ver; + int err = 0; mutex_lock(&sbi->cp_mutex); @@ -1104,21 +1214,35 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || (cpc->reason == CP_DISCARD && !sbi->discard_blks))) goto out; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + err = -EIO; goto out; - if (f2fs_readonly(sbi->sb)) + } + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; goto out; + } trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "start block_ops"); - if (block_operations(sbi)) + err = block_operations(sbi); + if (err) goto out; trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_flush_merged_bios(sbi); + + /* this is the case of multiple fstrims without any changes */ + if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { + f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); + f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); + f2fs_bug_on(sbi, prefree_segments(sbi)); + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } /* * update checkpoint pack index @@ -1133,7 +1257,7 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ - do_checkpoint(sbi, cpc); + err = do_checkpoint(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); @@ -1143,10 +1267,11 @@ void write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) "checkpoint: version = %llx", ckpt_ver); /* do checkpoint periodically */ - sbi->cp_expires = round_jiffies_up(jiffies + HZ * sbi->cp_interval); + f2fs_update_time(sbi, CP_TIME); + trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); out: mutex_unlock(&sbi->cp_mutex); - trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint"); + return err; } void init_ino_entry_info(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/crypto.c b/fs/f2fs/crypto.c deleted file mode 100644 index 4a62ef14e932..000000000000 --- a/fs/f2fs/crypto.c +++ /dev/null @@ -1,491 +0,0 @@ -/* - * linux/fs/f2fs/crypto.c - * - * Copied from linux/fs/ext4/crypto.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility - * - * This contains encryption functions for f2fs - * - * Written by Michael Halcrow, 2014. - * - * Filename encryption additions - * Uday Savagaonkar, 2014 - * Encryption policy handling additions - * Ildar Muslukhov, 2014 - * Remove ext4_encrypted_zeroout(), - * add f2fs_restore_and_release_control_page() - * Jaegeuk Kim, 2015. - * - * This has not yet undergone a rigorous security audit. - * - * The usage of AES-XTS should conform to recommendations in NIST - * Special Publication 800-38E and IEEE P1619/D16. - */ -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -/* Encryption added and removed here! (L: */ - -static unsigned int num_prealloc_crypto_pages = 32; -static unsigned int num_prealloc_crypto_ctxs = 128; - -module_param(num_prealloc_crypto_pages, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_pages, - "Number of crypto pages to preallocate"); -module_param(num_prealloc_crypto_ctxs, uint, 0444); -MODULE_PARM_DESC(num_prealloc_crypto_ctxs, - "Number of crypto contexts to preallocate"); - -static mempool_t *f2fs_bounce_page_pool; - -static LIST_HEAD(f2fs_free_crypto_ctxs); -static DEFINE_SPINLOCK(f2fs_crypto_ctx_lock); - -static struct workqueue_struct *f2fs_read_workqueue; -static DEFINE_MUTEX(crypto_init); - -static struct kmem_cache *f2fs_crypto_ctx_cachep; -struct kmem_cache *f2fs_crypt_info_cachep; - -/** - * f2fs_release_crypto_ctx() - Releases an encryption context - * @ctx: The encryption context to release. - * - * If the encryption context was allocated from the pre-allocated pool, returns - * it to that pool. Else, frees it. - * - * If there's a bounce page in the context, this frees that. - */ -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *ctx) -{ - unsigned long flags; - - if (ctx->flags & F2FS_WRITE_PATH_FL && ctx->w.bounce_page) { - mempool_free(ctx->w.bounce_page, f2fs_bounce_page_pool); - ctx->w.bounce_page = NULL; - } - ctx->w.control_page = NULL; - if (ctx->flags & F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL) { - kmem_cache_free(f2fs_crypto_ctx_cachep, ctx); - } else { - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - } -} - -/** - * f2fs_get_crypto_ctx() - Gets an encryption context - * @inode: The inode for which we are doing the crypto - * - * Allocates and initializes an encryption context. - * - * Return: An allocated and initialized encryption context on success; error - * value or NULL otherwise. - */ -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *inode) -{ - struct f2fs_crypto_ctx *ctx = NULL; - unsigned long flags; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - - if (ci == NULL) - return ERR_PTR(-ENOKEY); - - /* - * We first try getting the ctx from a free list because in - * the common case the ctx will have an allocated and - * initialized crypto tfm, so it's probably a worthwhile - * optimization. For the bounce page, we first try getting it - * from the kernel allocator because that's just about as fast - * as getting it from a list and because a cache of free pages - * should generally be a "last resort" option for a filesystem - * to be able to do its job. - */ - spin_lock_irqsave(&f2fs_crypto_ctx_lock, flags); - ctx = list_first_entry_or_null(&f2fs_free_crypto_ctxs, - struct f2fs_crypto_ctx, free_list); - if (ctx) - list_del(&ctx->free_list); - spin_unlock_irqrestore(&f2fs_crypto_ctx_lock, flags); - if (!ctx) { - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_NOFS); - if (!ctx) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } else { - ctx->flags &= ~F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL; - } - ctx->flags &= ~F2FS_WRITE_PATH_FL; - return ctx; -} - -/* - * Call f2fs_decrypt on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct f2fs_crypto_ctx *ctx = - container_of(work, struct f2fs_crypto_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = f2fs_decrypt(ctx, page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else - SetPageUptodate(page); - unlock_page(page); - } - f2fs_release_crypto_ctx(ctx); - bio_put(bio); -} - -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(f2fs_read_workqueue, &ctx->r.work); -} - -static void f2fs_crypto_destroy(void) -{ - struct f2fs_crypto_ctx *pos, *n; - - list_for_each_entry_safe(pos, n, &f2fs_free_crypto_ctxs, free_list) - kmem_cache_free(f2fs_crypto_ctx_cachep, pos); - INIT_LIST_HEAD(&f2fs_free_crypto_ctxs); - if (f2fs_bounce_page_pool) - mempool_destroy(f2fs_bounce_page_pool); - f2fs_bounce_page_pool = NULL; -} - -/** - * f2fs_crypto_initialize() - Set up for f2fs encryption. - * - * We only call this when we start accessing encrypted files, since it - * results in memory getting allocated that wouldn't otherwise be used. - * - * Return: Zero on success, non-zero otherwise. - */ -int f2fs_crypto_initialize(void) -{ - int i, res = -ENOMEM; - - if (f2fs_bounce_page_pool) - return 0; - - mutex_lock(&crypto_init); - if (f2fs_bounce_page_pool) - goto already_initialized; - - for (i = 0; i < num_prealloc_crypto_ctxs; i++) { - struct f2fs_crypto_ctx *ctx; - - ctx = kmem_cache_zalloc(f2fs_crypto_ctx_cachep, GFP_KERNEL); - if (!ctx) - goto fail; - list_add(&ctx->free_list, &f2fs_free_crypto_ctxs); - } - - /* must be allocated at the last step to avoid race condition above */ - f2fs_bounce_page_pool = - mempool_create_page_pool(num_prealloc_crypto_pages, 0); - if (!f2fs_bounce_page_pool) - goto fail; - -already_initialized: - mutex_unlock(&crypto_init); - return 0; -fail: - f2fs_crypto_destroy(); - mutex_unlock(&crypto_init); - return res; -} - -/** - * f2fs_exit_crypto() - Shutdown the f2fs encryption system - */ -void f2fs_exit_crypto(void) -{ - f2fs_crypto_destroy(); - - if (f2fs_read_workqueue) - destroy_workqueue(f2fs_read_workqueue); - if (f2fs_crypto_ctx_cachep) - kmem_cache_destroy(f2fs_crypto_ctx_cachep); - if (f2fs_crypt_info_cachep) - kmem_cache_destroy(f2fs_crypt_info_cachep); -} - -int __init f2fs_init_crypto(void) -{ - int res = -ENOMEM; - - f2fs_read_workqueue = alloc_workqueue("f2fs_crypto", WQ_HIGHPRI, 0); - if (!f2fs_read_workqueue) - goto fail; - - f2fs_crypto_ctx_cachep = KMEM_CACHE(f2fs_crypto_ctx, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypto_ctx_cachep) - goto fail; - - f2fs_crypt_info_cachep = KMEM_CACHE(f2fs_crypt_info, - SLAB_RECLAIM_ACCOUNT); - if (!f2fs_crypt_info_cachep) - goto fail; - - return 0; -fail: - f2fs_exit_crypto(); - return res; -} - -void f2fs_restore_and_release_control_page(struct page **page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. */ - bounce_page = *page; - ctx = (struct f2fs_crypto_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - f2fs_restore_control_page(bounce_page); -} - -void f2fs_restore_control_page(struct page *data_page) -{ - struct f2fs_crypto_ctx *ctx = - (struct f2fs_crypto_ctx *)page_private(data_page); - - set_page_private(data_page, (unsigned long)NULL); - ClearPagePrivate(data_page); - unlock_page(data_page); - f2fs_release_crypto_ctx(ctx); -} - -/** - * f2fs_crypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation - */ -static void f2fs_crypt_complete(struct crypto_async_request *req, int res) -{ - struct f2fs_completion_result *ecr = req->data; - - if (res == -EINPROGRESS) - return; - ecr->res = res; - complete(&ecr->completion); -} - -typedef enum { - F2FS_DECRYPT = 0, - F2FS_ENCRYPT, -} f2fs_direction_t; - -static int f2fs_page_crypto(struct f2fs_crypto_ctx *ctx, - struct inode *inode, - f2fs_direction_t rw, - pgoff_t index, - struct page *src_page, - struct page *dest_page) -{ - u8 xts_tweak[F2FS_XTS_TWEAK_SIZE]; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist dst, src; - struct f2fs_crypt_info *ci = F2FS_I(inode)->i_crypt_info; - struct crypto_ablkcipher *tfm = ci->ci_ctfm; - int res = 0; - - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", - __func__); - return -ENOMEM; - } - ablkcipher_request_set_callback( - req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - f2fs_crypt_complete, &ecr); - - BUILD_BUG_ON(F2FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - F2FS_XTS_TWEAK_SIZE - sizeof(index)); - - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_CACHE_SIZE, 0); - sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_CACHE_SIZE, 0); - ablkcipher_request_set_crypt(req, &src, &dst, PAGE_CACHE_SIZE, - xts_tweak); - if (rw == F2FS_DECRYPT) - res = crypto_ablkcipher_decrypt(req); - else - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } - ablkcipher_request_free(req); - if (res) { - printk_ratelimited(KERN_ERR - "%s: crypto_ablkcipher_encrypt() returned %d\n", - __func__, res); - return res; - } - return 0; -} - -static struct page *alloc_bounce_page(struct f2fs_crypto_ctx *ctx) -{ - ctx->w.bounce_page = mempool_alloc(f2fs_bounce_page_pool, GFP_NOWAIT); - if (ctx->w.bounce_page == NULL) - return ERR_PTR(-ENOMEM); - ctx->flags |= F2FS_WRITE_PATH_FL; - return ctx->w.bounce_page; -} - -/** - * f2fs_encrypt() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. - * - * Called on the page write path. The caller must call - * f2fs_restore_control_page() on the returned ciphertext page to - * release the bounce buffer and the encryption context. - * - * Return: An allocated page with the encrypted content on success. Else, an - * error value or NULL. - */ -struct page *f2fs_encrypt(struct inode *inode, - struct page *plaintext_page) -{ - struct f2fs_crypto_ctx *ctx; - struct page *ciphertext_page = NULL; - int err; - - BUG_ON(!PageLocked(plaintext_page)); - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - return (struct page *)ctx; - - /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx); - if (IS_ERR(ciphertext_page)) - goto err_out; - - ctx->w.control_page = plaintext_page; - err = f2fs_page_crypto(ctx, inode, F2FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page); - if (err) { - ciphertext_page = ERR_PTR(err); - goto err_out; - } - - SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)ctx); - lock_page(ciphertext_page); - return ciphertext_page; - -err_out: - f2fs_release_crypto_ctx(ctx); - return ciphertext_page; -} - -/** - * f2fs_decrypt() - Decrypts a page in-place - * @ctx: The encryption context. - * @page: The page to decrypt. Must be locked. - * - * Decrypts page in-place using the ctx encryption context. - * - * Called from the read completion callback. - * - * Return: Zero on success, non-zero otherwise. - */ -int f2fs_decrypt(struct f2fs_crypto_ctx *ctx, struct page *page) -{ - BUG_ON(!PageLocked(page)); - - return f2fs_page_crypto(ctx, page->mapping->host, - F2FS_DECRYPT, page->index, page, page); -} - -/* - * Convenience function which takes care of allocating and - * deallocating the encryption context - */ -int f2fs_decrypt_one(struct inode *inode, struct page *page) -{ - struct f2fs_crypto_ctx *ctx = f2fs_get_crypto_ctx(inode); - int ret; - - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - ret = f2fs_decrypt(ctx, page); - f2fs_release_crypto_ctx(ctx); - return ret; -} - -bool f2fs_valid_contents_enc_mode(uint32_t mode) -{ - return (mode == F2FS_ENCRYPTION_MODE_AES_256_XTS); -} - -/** - * f2fs_validate_encryption_key_size() - Validate the encryption key size - * @mode: The key mode. - * @size: The key size to validate. - * - * Return: The validated key size for @mode. Zero if invalid. - */ -uint32_t f2fs_validate_encryption_key_size(uint32_t mode, uint32_t size) -{ - if (size == f2fs_encryption_key_size(mode)) - return size; - return 0; -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 8936044dee4c..fc9bfd8f566d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -19,6 +19,8 @@ #include #include #include +#include +#include #include #include "f2fs.h" @@ -33,11 +35,16 @@ static void f2fs_read_end_io(struct bio *bio) struct bio_vec *bvec; int i; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + bio->bi_error = -EIO; +#endif + if (f2fs_bio_encrypted(bio)) { if (bio->bi_error) { - f2fs_release_crypto_ctx(bio->bi_private); + fscrypt_release_ctx(bio->bi_private); } else { - f2fs_end_io_crypto_work(bio->bi_private, bio); + fscrypt_decrypt_bio_pages(bio->bi_private, bio); return; } } @@ -46,7 +53,8 @@ static void f2fs_read_end_io(struct bio *bio) struct page *page = bvec->bv_page; if (!bio->bi_error) { - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { ClearPageUptodate(page); SetPageError(page); @@ -65,19 +73,16 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; - f2fs_restore_and_release_control_page(&page); + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { - set_page_dirty(page); set_bit(AS_EIO, &page->mapping->flags); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, true); } end_page_writeback(page); - dec_page_count(sbi, F2FS_WRITEBACK); } - - if (!get_pages(sbi, F2FS_WRITEBACK) && - !list_empty(&sbi->cp_wait.task_list)) + if (atomic_dec_and_test(&sbi->nr_wb_bios) && + wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); bio_put(bio); @@ -101,6 +106,18 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } +static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, + struct bio *bio, enum page_type type) +{ + if (!is_read_io(rw)) { + atomic_inc(&sbi->nr_wb_bios); + if (f2fs_sb_mounted_hmsmr(sbi->sb) && + current->plug && (type == DATA || type == NODE)) + blk_finish_plug(current->plug); + } + submit_bio(rw, bio); +} + static void __submit_merged_bio(struct f2fs_bio_info *io) { struct f2fs_io_info *fio = &io->fio; @@ -113,12 +130,58 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - submit_bio(fio->rw, io->bio); + __submit_bio(io->sbi, fio->rw, io->bio, fio->type); io->bio = NULL; } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - enum page_type type, int rw) +static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, + struct page *page, nid_t ino) +{ + struct bio_vec *bvec; + struct page *target; + int i; + + if (!io->bio) + return false; + + if (!inode && !page && !ino) + return true; + + bio_for_each_segment_all(bvec, io->bio, i) { + + if (bvec->bv_page->mapping) + target = bvec->bv_page; + else + target = fscrypt_control_page(bvec->bv_page); + + if (inode && inode == target->mapping->host) + return true; + if (page && page == target) + return true; + if (ino && ino == ino_of_node(target)) + return true; + } + + return false; +} + +static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, + struct page *page, nid_t ino, + enum page_type type) +{ + enum page_type btype = PAGE_TYPE_OF_BIO(type); + struct f2fs_bio_info *io = &sbi->write_io[btype]; + bool ret; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, page, ino); + up_read(&io->io_rwsem); + return ret; +} + +static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) { enum page_type btype = PAGE_TYPE_OF_BIO(type); struct f2fs_bio_info *io; @@ -127,6 +190,9 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, down_write(&io->io_rwsem); + if (!__has_merged_page(io, inode, page, ino)) + goto out; + /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; @@ -136,9 +202,31 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; } __submit_merged_bio(io); +out: up_write(&io->io_rwsem); } +void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, + int rw) +{ + __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); +} + +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, + struct inode *inode, struct page *page, + nid_t ino, enum page_type type, int rw) +{ + if (has_merged_page(sbi, inode, page, ino, type)) + __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); +} + +void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_bio(sbi, META, WRITE); +} + /* * Fill the locked page with data located in the block address. * Return unlocked page. @@ -146,20 +234,21 @@ void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, int f2fs_submit_page_bio(struct f2fs_io_info *fio) { struct bio *bio; - struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct page *page = fio->encrypted_page ? + fio->encrypted_page : fio->page; trace_f2fs_submit_page_bio(page, fio); f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->blk_addr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); - if (bio_add_page(bio, page, PAGE_CACHE_SIZE, 0) < PAGE_CACHE_SIZE) { + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } - submit_bio(fio->rw, bio); + __submit_bio(fio->sbi, fio->rw, bio, fio->type); return 0; } @@ -173,39 +262,49 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io = is_read ? &sbi->read_io : &sbi->write_io[btype]; - verify_block_addr(sbi, fio->blk_addr); + if (fio->old_blkaddr != NEW_ADDR) + verify_block_addr(sbi, fio->old_blkaddr); + verify_block_addr(sbi, fio->new_blkaddr); down_write(&io->io_rwsem); - if (!is_read) - inc_page_count(sbi, F2FS_WRITEBACK); - - if (io->bio && (io->last_block_in_bio != fio->blk_addr - 1 || + if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || io->fio.rw != fio->rw)) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->blk_addr, bio_blocks, is_read); + io->bio = __bio_alloc(sbi, fio->new_blkaddr, + bio_blocks, is_read); io->fio = *fio; } bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_CACHE_SIZE, 0) < - PAGE_CACHE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < + PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } - io->last_block_in_bio = fio->blk_addr; + io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); } +static void __set_data_blkaddr(struct dnode_of_data *dn) +{ + struct f2fs_node *rn = F2FS_NODE(dn->node_page); + __le32 *addr_array; + + /* Get physical address of data block */ + addr_array = blkaddr_in_node(rn); + addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); +} + /* * Lock ordering for the change of data block address: * ->data_page @@ -214,39 +313,63 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) */ void set_data_blkaddr(struct dnode_of_data *dn) { - struct f2fs_node *rn; - __le32 *addr_array; - struct page *node_page = dn->node_page; - unsigned int ofs_in_node = dn->ofs_in_node; - - f2fs_wait_on_page_writeback(node_page, NODE); - - rn = F2FS_NODE(node_page); - - /* Get physical address of data block */ - addr_array = blkaddr_in_node(rn); - addr_array[ofs_in_node] = cpu_to_le32(dn->data_blkaddr); - set_page_dirty(node_page); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + __set_data_blkaddr(dn); + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; } -int reserve_new_block(struct dnode_of_data *dn) +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) +{ + dn->data_blkaddr = blkaddr; + set_data_blkaddr(dn); + f2fs_update_extent_cache(dn); +} + +/* dn->ofs_in_node will be returned with up-to-date last block pointer */ +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (!count) + return 0; + + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; - trace_f2fs_reserve_new_block(dn->inode, dn->nid, dn->ofs_in_node); + trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, + dn->ofs_in_node, count); - dn->data_blkaddr = NEW_ADDR; - set_data_blkaddr(dn); - mark_inode_dirty(dn->inode); - sync_inode_page(dn); + f2fs_wait_on_page_writeback(dn->node_page, NODE, true); + + for (; count > 0; dn->ofs_in_node++) { + block_t blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + if (blkaddr == NULL_ADDR) { + dn->data_blkaddr = NEW_ADDR; + __set_data_blkaddr(dn); + count--; + } + } + + if (set_page_dirty(dn->node_page)) + dn->node_changed = true; return 0; } +/* Should keep dn->ofs_in_node unchanged */ +int reserve_new_block(struct dnode_of_data *dn) +{ + unsigned int ofs_in_node = dn->ofs_in_node; + int ret; + + ret = reserve_new_blocks(dn, 1); + dn->ofs_in_node = ofs_in_node; + return ret; +} + int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { bool need_put = dn->inode_page ? false : true; @@ -326,13 +449,14 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, * see, f2fs_add_link -> get_new_data_page -> init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); return page; } - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; fio.page = page; err = f2fs_submit_page_bio(&fio); if (err) @@ -386,14 +510,14 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, /* wait for read completion */ lock_page(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return ERR_PTR(-EIO); + } return page; } @@ -413,7 +537,7 @@ struct page *get_new_data_page(struct inode *inode, struct page *page; struct dnode_of_data dn; int err; -repeat: + page = f2fs_grab_cache_page(mapping, index, true); if (!page) { /* @@ -437,45 +561,42 @@ struct page *get_new_data_page(struct inode *inode, goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); } else { f2fs_put_page(page, 1); - page = get_read_data_page(inode, index, READ_SYNC, true); + /* if ipage exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ipage); + page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - goto repeat; - - /* wait for read completion */ - lock_page(page); + return page; } got_it: if (new_i_size && i_size_read(inode) < - ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)) { - i_size_write(inode, ((loff_t)(index + 1) << PAGE_CACHE_SHIFT)); - /* Only the directory inode sets new_i_size */ - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + ((loff_t)(index + 1) << PAGE_SHIFT)) + f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); return page; } static int __allocate_data_block(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct f2fs_inode_info *fi = F2FS_I(dn->inode); struct f2fs_summary sum; struct node_info ni; int seg = CURSEG_WARM_DATA; pgoff_t fofs; + blkcnt_t count = 1; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, 1))) + if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) return -ENOSPC; alloc: @@ -490,72 +611,43 @@ static int __allocate_data_block(struct dnode_of_data *dn) set_data_blkaddr(dn); /* update i_size */ - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + dn->ofs_in_node; - if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)) - i_size_write(dn->inode, - ((loff_t)(fofs + 1) << PAGE_CACHE_SHIFT)); - - /* direct IO doesn't use extent cache to maximize the performance */ - f2fs_drop_largest_extent(dn->inode, fofs); - + if (i_size_read(dn->inode) < ((loff_t)(fofs + 1) << PAGE_SHIFT)) + f2fs_i_size_write(dn->inode, + ((loff_t)(fofs + 1) << PAGE_SHIFT)); return 0; } -static void __allocate_data_blocks(struct inode *inode, loff_t offset, - size_t count) +ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct dnode_of_data dn; - u64 start = F2FS_BYTES_TO_BLK(offset); - u64 len = F2FS_BYTES_TO_BLK(count); - bool allocated; - u64 end_offset; + struct inode *inode = file_inode(iocb->ki_filp); + struct f2fs_map_blocks map; + ssize_t ret = 0; - while (len) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); + map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); + if (map.m_len > map.m_lblk) + map.m_len -= map.m_lblk; + else + map.m_len = 0; - /* When reading holes, we need its node page */ - set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, start, ALLOC_NODE)) - goto out; + map.m_next_pgofs = NULL; - allocated = false; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); - - while (dn.ofs_in_node < end_offset && len) { - block_t blkaddr; - - if (unlikely(f2fs_cp_error(sbi))) - goto sync_out; - - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); - if (blkaddr == NULL_ADDR || blkaddr == NEW_ADDR) { - if (__allocate_data_block(&dn)) - goto sync_out; - allocated = true; - } - len--; - start++; - dn.ofs_in_node++; - } - - if (allocated) - sync_inode_page(&dn); - - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + if (iocb->ki_flags & IOCB_DIRECT) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } - return; - -sync_out: - if (allocated) - sync_inode_page(&dn); - f2fs_put_dnode(&dn); -out: - f2fs_unlock_op(sbi); - return; + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + } + if (!f2fs_has_inline_data(inode)) + return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + return ret; } /* @@ -567,156 +659,181 @@ static void __allocate_data_blocks(struct inode *inode, loff_t offset, * b. do not use extent cache for better performance * c. give the block addresses to blockdev */ -static int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int create, int flag) { unsigned int maxblocks = map->m_len; struct dnode_of_data dn; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int mode = create ? ALLOC_NODE : LOOKUP_NODE_RA; - pgoff_t pgofs, end_offset; + int mode = create ? ALLOC_NODE : LOOKUP_NODE; + pgoff_t pgofs, end_offset, end; int err = 0, ofs = 1; + unsigned int ofs_in_node, last_ofs_in_node; + blkcnt_t prealloc; struct extent_info ei; bool allocated = false; + block_t blkaddr; + + if (!maxblocks) + return 0; map->m_len = 0; map->m_flags = 0; /* it only supports block size == page size */ pgofs = (pgoff_t)map->m_lblk; + end = pgofs + maxblocks; - if (f2fs_lookup_extent_cache(inode, pgofs, &ei)) { + if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) { map->m_pblk = ei.blk + pgofs - ei.fofs; map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs); map->m_flags = F2FS_MAP_MAPPED; goto out; } +next_dnode: if (create) - f2fs_lock_op(F2FS_I_SB(inode)); + f2fs_lock_op(sbi); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, pgofs, mode); if (err) { - if (err == -ENOENT) + if (flag == F2FS_GET_BLOCK_BMAP) + map->m_pblk = 0; + if (err == -ENOENT) { err = 0; + if (map->m_next_pgofs) + *map->m_next_pgofs = + get_next_page_offset(&dn, pgofs); + } goto unlock_out; } - if (dn.data_blkaddr == NEW_ADDR || dn.data_blkaddr == NULL_ADDR) { + prealloc = 0; + ofs_in_node = dn.ofs_in_node; + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + +next_block: + blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; - goto put_out; + goto sync_out; + } + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (blkaddr == NULL_ADDR) { + prealloc++; + last_ofs_in_node = dn.ofs_in_node; + } + } else { + err = __allocate_data_block(&dn); + if (!err) { + set_inode_flag(inode, FI_APPEND_WRITE); + allocated = true; + } } - err = __allocate_data_block(&dn); if (err) - goto put_out; - allocated = true; + goto sync_out; map->m_flags = F2FS_MAP_NEW; + blkaddr = dn.data_blkaddr; } else { - if (flag != F2FS_GET_BLOCK_FIEMAP || - dn.data_blkaddr != NEW_ADDR) { - if (flag == F2FS_GET_BLOCK_BMAP) - err = -ENOENT; - goto put_out; + if (flag == F2FS_GET_BLOCK_BMAP) { + map->m_pblk = 0; + goto sync_out; } - - /* - * preallocated unwritten block should be mapped - * for fiemap. - */ - if (dn.data_blkaddr == NEW_ADDR) - map->m_flags = F2FS_MAP_UNWRITTEN; + if (flag == F2FS_GET_BLOCK_FIEMAP && + blkaddr == NULL_ADDR) { + if (map->m_next_pgofs) + *map->m_next_pgofs = pgofs + 1; + } + if (flag != F2FS_GET_BLOCK_FIEMAP || + blkaddr != NEW_ADDR) + goto sync_out; } } - map->m_flags |= F2FS_MAP_MAPPED; - map->m_pblk = dn.data_blkaddr; - map->m_len = 1; + if (flag == F2FS_GET_BLOCK_PRE_AIO) + goto skip; - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + if (map->m_len == 0) { + /* preallocated unwritten block should be mapped for fiemap. */ + if (blkaddr == NEW_ADDR) + map->m_flags |= F2FS_MAP_UNWRITTEN; + map->m_flags |= F2FS_MAP_MAPPED; + + map->m_pblk = blkaddr; + map->m_len = 1; + } else if ((map->m_pblk != NEW_ADDR && + blkaddr == (map->m_pblk + ofs)) || + (map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) || + flag == F2FS_GET_BLOCK_PRE_DIO) { + ofs++; + map->m_len++; + } else { + goto sync_out; + } + +skip: dn.ofs_in_node++; pgofs++; -get_next: - if (dn.ofs_in_node >= end_offset) { - if (allocated) - sync_inode_page(&dn); - allocated = false; - f2fs_put_dnode(&dn); + /* preallocate blocks in batch for one dnode page */ + if (flag == F2FS_GET_BLOCK_PRE_AIO && + (pgofs == end || dn.ofs_in_node == end_offset)) { - set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, mode); - if (err) { - if (err == -ENOENT) - err = 0; - goto unlock_out; + dn.ofs_in_node = ofs_in_node; + err = reserve_new_blocks(&dn, prealloc); + if (err) + goto sync_out; + allocated = dn.node_changed; + + map->m_len += dn.ofs_in_node - ofs_in_node; + if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { + err = -ENOSPC; + goto sync_out; } - - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + dn.ofs_in_node = end_offset; } - if (maxblocks > map->m_len) { - block_t blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (pgofs >= end) + goto sync_out; + else if (dn.ofs_in_node < end_offset) + goto next_block; - if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { - if (create) { - if (unlikely(f2fs_cp_error(sbi))) { - err = -EIO; - goto sync_out; - } - err = __allocate_data_block(&dn); - if (err) - goto sync_out; - allocated = true; - map->m_flags |= F2FS_MAP_NEW; - blkaddr = dn.data_blkaddr; - } else { - /* - * we only merge preallocated unwritten blocks - * for fiemap. - */ - if (flag != F2FS_GET_BLOCK_FIEMAP || - blkaddr != NEW_ADDR) - goto sync_out; - } - } + f2fs_put_dnode(&dn); - /* Give more consecutive addresses for the readahead */ - if ((map->m_pblk != NEW_ADDR && - blkaddr == (map->m_pblk + ofs)) || - (map->m_pblk == NEW_ADDR && - blkaddr == NEW_ADDR)) { - ofs++; - dn.ofs_in_node++; - pgofs++; - map->m_len++; - goto get_next; - } + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); } + allocated = false; + goto next_dnode; + sync_out: - if (allocated) - sync_inode_page(&dn); -put_out: f2fs_put_dnode(&dn); unlock_out: - if (create) - f2fs_unlock_op(F2FS_I_SB(inode)); + if (create) { + f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, allocated); + } out: trace_f2fs_map_blocks(inode, map, err); return err; } static int __get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh, int create, int flag) + struct buffer_head *bh, int create, int flag, + pgoff_t *next_pgofs) { struct f2fs_map_blocks map; int ret; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; + map.m_next_pgofs = next_pgofs; ret = f2fs_map_blocks(inode, &map, create, flag); if (!ret) { @@ -728,23 +845,29 @@ static int __get_data_block(struct inode *inode, sector_t iblock, } static int get_data_block(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create, int flag) + struct buffer_head *bh_result, int create, int flag, + pgoff_t *next_pgofs) { - return __get_data_block(inode, iblock, bh_result, create, flag); + return __get_data_block(inode, iblock, bh_result, create, + flag, next_pgofs); } static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO); + F2FS_GET_BLOCK_DIO, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { + /* Block number less than F2FS MAX BLOCKS */ + if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks)) + return -EFBIG; + return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_BMAP); + F2FS_GET_BLOCK_BMAP, NULL); } static inline sector_t logical_to_blk(struct inode *inode, loff_t offset) @@ -762,10 +885,10 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { struct buffer_head map_bh; sector_t start_blk, last_blk; - loff_t isize = i_size_read(inode); + pgoff_t next_pgofs; + loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; - bool past_eof = false, whole_file = false; int ret = 0; ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC); @@ -778,82 +901,64 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); - if (len >= isize) { - whole_file = true; - len = isize; - } + isize = i_size_read(inode); + if (start >= isize) + goto out; + + if (start + len > isize) + len = isize - start; if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); start_blk = logical_to_blk(inode, start); last_blk = logical_to_blk(inode, start + len - 1); + next: memset(&map_bh, 0, sizeof(struct buffer_head)); map_bh.b_size = len; ret = get_data_block(inode, start_blk, &map_bh, 0, - F2FS_GET_BLOCK_FIEMAP); + F2FS_GET_BLOCK_FIEMAP, &next_pgofs); if (ret) goto out; /* HOLE */ if (!buffer_mapped(&map_bh)) { - start_blk++; - - if (!past_eof && blk_to_logical(inode, start_blk) >= isize) - past_eof = 1; - - if (past_eof && size) { - flags |= FIEMAP_EXTENT_LAST; - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - } else if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - size = 0; - } - - /* if we have holes up to/past EOF then we're done */ - if (start_blk > last_blk || past_eof || ret) - goto out; - } else { - if (start_blk > last_blk && !whole_file) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - goto out; - } - - /* - * if size != 0 then we know we already have an extent - * to add, so add it. + start_blk = next_pgofs; + /* Go through holes util pass the EOF */ + if (blk_to_logical(inode, start_blk) < isize) + goto prep_next; + /* Found a hole beyond isize means no more extents. + * Note that the premise is that filesystems don't + * punch holes beyond isize and keep size unchanged. */ - if (size) { - ret = fiemap_fill_next_extent(fieinfo, logical, - phys, size, flags); - if (ret) - goto out; - } - - logical = blk_to_logical(inode, start_blk); - phys = blk_to_logical(inode, map_bh.b_blocknr); - size = map_bh.b_size; - flags = 0; - if (buffer_unwritten(&map_bh)) - flags = FIEMAP_EXTENT_UNWRITTEN; - - start_blk += logical_to_blk(inode, size); - - /* - * If we are past the EOF, then we need to make sure as - * soon as we find a hole that the last extent we found - * is marked with FIEMAP_EXTENT_LAST - */ - if (!past_eof && logical + size >= isize) - past_eof = true; + flags |= FIEMAP_EXTENT_LAST; } + + if (size) { + if (f2fs_encrypted_inode(inode)) + flags |= FIEMAP_EXTENT_DATA_ENCRYPTED; + + ret = fiemap_fill_next_extent(fieinfo, logical, + phys, size, flags); + } + + if (start_blk > last_blk || ret) + goto out; + + logical = blk_to_logical(inode, start_blk); + phys = blk_to_logical(inode, map_bh.b_blocknr); + size = map_bh.b_size; + flags = 0; + if (buffer_unwritten(&map_bh)) + flags = FIEMAP_EXTENT_UNWRITTEN; + + start_blk += logical_to_blk(inode, size); + +prep_next: cond_resched(); if (fatal_signal_pending(current)) ret = -EINTR; @@ -863,10 +968,41 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } +static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct block_device *bdev = sbi->sb->s_bdev; + struct bio *bio; + + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + + return bio; +} + /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -885,13 +1021,13 @@ static int f2fs_mpage_readpages(struct address_space *mapping, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - struct block_device *bdev = inode->i_sb->s_bdev; struct f2fs_map_blocks map; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; + map.m_next_pgofs = NULL; for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { @@ -930,7 +1066,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_READ)) goto set_error_page; } got_it: @@ -943,8 +1079,9 @@ static int f2fs_mpage_readpages(struct address_space *mapping, goto confused; } } else { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); - SetPageUptodate(page); + zero_user_segment(page, 0, PAGE_SIZE); + if (!PageUptodate(page)) + SetPageUptodate(page); unlock_page(page); goto next_page; } @@ -955,35 +1092,15 @@ static int f2fs_mpage_readpages(struct address_space *mapping, */ if (bio && (last_block_in_bio != block_nr - 1)) { submit_and_realloc: - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } if (bio == NULL) { - struct f2fs_crypto_ctx *ctx = NULL; - - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { - - ctx = f2fs_get_crypto_ctx(inode); - if (IS_ERR(ctx)) - goto set_error_page; - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback( - F2FS_I_SB(inode), block_nr); - } - - bio = bio_alloc(GFP_KERNEL, - min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - f2fs_release_crypto_ctx(ctx); + bio = f2fs_grab_bio(inode, block_nr, nr_pages); + if (IS_ERR(bio)) { + bio = NULL; goto set_error_page; } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(block_nr); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -993,22 +1110,22 @@ static int f2fs_mpage_readpages(struct address_space *mapping, goto next_page; set_error_page: SetPageError(page); - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); unlock_page(page); goto next_page; confused: if (bio) { - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; } unlock_page(page); next_page: if (pages) - page_cache_release(page); + put_page(page); } BUG_ON(pages && !list_empty(pages)); if (bio) - submit_bio(READ, bio); + __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); return 0; } @@ -1055,23 +1172,33 @@ int do_write_data_page(struct f2fs_io_info *fio) if (err) return err; - fio->blk_addr = dn.data_blkaddr; + fio->old_blkaddr = dn.data_blkaddr; /* This page is already truncated */ - if (fio->blk_addr == NULL_ADDR) { + if (fio->old_blkaddr == NULL_ADDR) { ClearPageUptodate(page); goto out_writepage; } if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { + gfp_t gfp_flags = GFP_NOFS; /* wait for GCed encrypted page writeback */ f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->blk_addr); - - fio->encrypted_page = f2fs_encrypt(inode, fio->page); + fio->old_blkaddr); +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + gfp_flags); if (IS_ERR(fio->encrypted_page)) { err = PTR_ERR(fio->encrypted_page); + if (err == -ENOMEM) { + /* flush pending ios and wait for a while */ + f2fs_flush_merged_bios(F2FS_I_SB(inode)); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + err = 0; + goto retry_encrypt; + } goto out_writepage; } } @@ -1082,20 +1209,19 @@ int do_write_data_page(struct f2fs_io_info *fio) * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->blk_addr != NEW_ADDR && + if (unlikely(fio->old_blkaddr != NEW_ADDR && !is_cold_data(page) && + !IS_ATOMIC_WRITTEN_PAGE(page) && need_inplace_update(inode))) { rewrite_data_page(fio); - set_inode_flag(F2FS_I(inode), FI_UPDATE_WRITE); + set_inode_flag(inode, FI_UPDATE_WRITE); trace_f2fs_do_write_data_page(page, IPU); } else { write_data_page(&dn, fio); - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); } out_writepage: f2fs_put_dnode(&dn); @@ -1109,7 +1235,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); loff_t i_size = i_size_read(inode); const pgoff_t end_index = ((unsigned long long) i_size) - >> PAGE_CACHE_SHIFT; + >> PAGE_SHIFT; + loff_t psize = (page->index + 1) << PAGE_SHIFT; unsigned offset = 0; bool need_balance_fs = false; int err = 0; @@ -1130,37 +1257,37 @@ static int f2fs_write_data_page(struct page *page, * If the offset is out-of-range of file size, * this page does not have to be written to disk. */ - offset = i_size & (PAGE_CACHE_SIZE - 1); + offset = i_size & (PAGE_SIZE - 1); if ((page->index >= end_index + 1) || !offset) goto out; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + zero_user_segment(page, offset, PAGE_SIZE); write: if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; - if (f2fs_is_volatile_file(inode) && !wbc->for_reclaim && - available_free_memory(sbi, BASE_CHECK)) + /* we should not write 0'th page having journal header */ + if (f2fs_is_volatile_file(inode) && (!page->index || + (!wbc->for_reclaim && + available_free_memory(sbi, BASE_CHECK)))) goto redirty_out; + /* we should bypass data pages to proceed the kworkder jobs */ + if (unlikely(f2fs_cp_error(sbi))) { + mapping_set_error(page->mapping, -EIO); + goto out; + } + /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; err = do_write_data_page(&fio); goto done; } - /* we should bypass data pages to proceed the kworkder jobs */ - if (unlikely(f2fs_cp_error(sbi))) { - SetPageError(page); - goto out; - } - if (!wbc->for_reclaim) need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0)) + else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; err = -EAGAIN; @@ -1169,6 +1296,8 @@ static int f2fs_write_data_page(struct page *page, err = f2fs_write_inline_data(inode, page); if (err == -EAGAIN) err = do_write_data_page(&fio); + if (F2FS_I(inode)->last_disk_size < psize) + F2FS_I(inode)->last_disk_size = psize; f2fs_unlock_op(sbi); done: if (err && err != -ENOENT) @@ -1179,25 +1308,24 @@ static int f2fs_write_data_page(struct page *page, inode_dec_dirty_pages(inode); if (err) ClearPageUptodate(page); + + if (wbc->for_reclaim) { + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + remove_dirty_inode(inode); + } + unlock_page(page); - if (need_balance_fs) - f2fs_balance_fs(sbi); - if (wbc->for_reclaim) + f2fs_balance_fs(sbi, need_balance_fs); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, DATA, WRITE); + return 0; redirty_out: redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - -static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, - void *data) -{ - struct address_space *mapping = data; - int ret = mapping->a_ops->writepage(page, wbc); - mapping_set_error(mapping, ret); - return ret; + unlock_page(page); + return err; } /* @@ -1206,8 +1334,7 @@ static int __f2fs_writepage(struct page *page, struct writeback_control *wbc, * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc, writepage_t writepage, - void *data) + struct writeback_control *wbc) { int ret = 0; int done = 0; @@ -1220,10 +1347,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, int cycled; int range_whole = 0; int tag; - int step = 0; + int nwritten = 0; pagevec_init(&pvec, 0); -next: + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1233,8 +1360,8 @@ static int f2fs_write_cache_pages(struct address_space *mapping, cycled = 0; end = -1; } else { - index = wbc->range_start >> PAGE_CACHE_SHIFT; - end = wbc->range_end >> PAGE_CACHE_SHIFT; + index = wbc->range_start >> PAGE_SHIFT; + end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) range_whole = 1; cycled = 1; /* ignore range_cyclic tests */ @@ -1278,12 +1405,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, goto continue_unlock; } - if (step == is_cold_data(page)) - goto continue_unlock; - if (PageWriteback(page)) { if (wbc->sync_mode != WB_SYNC_NONE) - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, + DATA, true); else goto continue_unlock; } @@ -1292,16 +1417,13 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = (*writepage)(page, wbc, data); + ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(page); - ret = 0; - } else { - done_index = page->index + 1; - done = 1; - break; - } + done_index = page->index + 1; + done = 1; + break; + } else { + nwritten++; } if (--wbc->nr_to_write <= 0 && @@ -1314,11 +1436,6 @@ static int f2fs_write_cache_pages(struct address_space *mapping, cond_resched(); } - if (step < 1) { - step++; - goto next; - } - if (!cycled && !done) { cycled = 1; index = 0; @@ -1328,6 +1445,10 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; + if (nwritten) + f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, + NULL, 0, DATA, WRITE); + return ret; } @@ -1336,11 +1457,8 @@ static int f2fs_write_data_pages(struct address_space *mapping, { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool locked = false; + struct blk_plug plug; int ret; - long diff; - - trace_f2fs_writepages(mapping->host, wbc, DATA); /* deal with chardevs and other special file */ if (!mapping->a_ops->writepage) @@ -1355,41 +1473,119 @@ static int f2fs_write_data_pages(struct address_space *mapping, available_free_memory(sbi, DIRTY_DENTS)) goto skip_write; + /* skip writing during file defragment */ + if (is_inode_flag_set(inode, FI_DO_DEFRAG)) + goto skip_write; + /* during POR, we don't need to trigger writepage at all. */ if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto skip_write; - diff = nr_pages_to_write(sbi, DATA, wbc); + trace_f2fs_writepages(mapping->host, wbc, DATA); - if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&sbi->writepages); - locked = true; - } - ret = f2fs_write_cache_pages(mapping, wbc, __f2fs_writepage, mapping); - f2fs_submit_merged_bio(sbi, DATA, WRITE); - if (locked) - mutex_unlock(&sbi->writepages); + blk_start_plug(&plug); + ret = f2fs_write_cache_pages(mapping, wbc); + blk_finish_plug(&plug); + /* + * if some pages were truncated, we cannot guarantee its mapping->host + * to detect pending bios. + */ - remove_dirty_dir_inode(inode); - - wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); + remove_dirty_inode(inode); return ret; skip_write: wbc->pages_skipped += get_dirty_pages(inode); + trace_f2fs_writepages(mapping->host, wbc, DATA); return 0; } static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; + loff_t i_size = i_size_read(inode); - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - truncate_blocks(inode, inode->i_size, true); + if (to > i_size) { + truncate_pagecache(inode, i_size); + truncate_blocks(inode, i_size, true); } } +static int prepare_write_begin(struct f2fs_sb_info *sbi, + struct page *page, loff_t pos, unsigned len, + block_t *blk_addr, bool *node_changed) +{ + struct inode *inode = page->mapping->host; + pgoff_t index = page->index; + struct dnode_of_data dn; + struct page *ipage; + bool locked = false; + struct extent_info ei; + int err = 0; + + /* + * we already allocated all the blocks, so we don't need to get + * the block addresses when there is no need to fill the page. + */ + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + return 0; + + if (f2fs_has_inline_data(inode) || + (pos & PAGE_MASK) >= i_size_read(inode)) { + f2fs_lock_op(sbi); + locked = true; + } +restart: + /* check inline_data */ + ipage = get_node_page(sbi, inode->i_ino); + if (IS_ERR(ipage)) { + err = PTR_ERR(ipage); + goto unlock_out; + } + + set_new_dnode(&dn, inode, ipage, ipage, 0); + + if (f2fs_has_inline_data(inode)) { + if (pos + len <= MAX_INLINE_DATA) { + read_inline_data(page, ipage); + set_inode_flag(inode, FI_DATA_EXIST); + if (inode->i_nlink) + set_inline_node(ipage); + } else { + err = f2fs_convert_inline_page(&dn, page); + if (err) + goto out; + if (dn.data_blkaddr == NULL_ADDR) + err = f2fs_get_block(&dn, index); + } + } else if (locked) { + err = f2fs_get_block(&dn, index); + } else { + if (f2fs_lookup_extent_cache(inode, index, &ei)) { + dn.data_blkaddr = ei.blk + index - ei.fofs; + } else { + /* hole case */ + err = get_dnode_of_data(&dn, index, LOOKUP_NODE); + if (err || dn.data_blkaddr == NULL_ADDR) { + f2fs_put_dnode(&dn); + f2fs_lock_op(sbi); + locked = true; + goto restart; + } + } + } + + /* convert_inline_page can make node_changed */ + *blk_addr = dn.data_blkaddr; + *node_changed = dn.node_changed; +out: + f2fs_put_dnode(&dn); +unlock_out: + if (locked) + f2fs_unlock_op(sbi); + return err; +} + static int f2fs_write_begin(struct file *file, struct address_space *mapping, loff_t pos, unsigned len, unsigned flags, struct page **pagep, void **fsdata) @@ -1397,9 +1593,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *page = NULL; - struct page *ipage; - pgoff_t index = ((unsigned long long) pos) >> PAGE_CACHE_SHIFT; - struct dnode_of_data dn; + pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT; + bool need_balance = false; + block_t blkaddr = NULL_ADDR; int err = 0; if (trace_android_fs_datawrite_start_enabled()) { @@ -1414,8 +1610,6 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, } trace_f2fs_write_begin(inode, pos, len, flags); - f2fs_balance_fs(sbi); - /* * We should check this at this moment to avoid deadlock on inode page * and #0 page. The locking rule for inline_data conversion should be: @@ -1435,98 +1629,63 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, *pagep = page; - f2fs_lock_op(sbi); - - /* check inline_data */ - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); - goto unlock_fail; - } - - set_new_dnode(&dn, inode, ipage, ipage, 0); - - if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { - read_inline_data(page, ipage); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - sync_inode_page(&dn); - goto put_next; - } - err = f2fs_convert_inline_page(&dn, page); - if (err) - goto put_fail; - } - - err = f2fs_get_block(&dn, index); + err = prepare_write_begin(sbi, page, pos, len, + &blkaddr, &need_balance); if (err) - goto put_fail; -put_next: - f2fs_put_dnode(&dn); - f2fs_unlock_op(sbi); + goto fail; - f2fs_wait_on_page_writeback(page, DATA); + if (need_balance && has_not_enough_free_secs(sbi, 0, 0)) { + unlock_page(page); + f2fs_balance_fs(sbi, true); + lock_page(page); + if (page->mapping != mapping) { + /* The page got truncated from under us */ + f2fs_put_page(page, 1); + goto repeat; + } + } + + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - if (len == PAGE_CACHE_SIZE) - goto out_update; - if (PageUptodate(page)) - goto out_clear; + if (len == PAGE_SIZE || PageUptodate(page)) + return 0; - if ((pos & PAGE_CACHE_MASK) >= i_size_read(inode)) { - unsigned start = pos & (PAGE_CACHE_SIZE - 1); - unsigned end = start + len; - - /* Reading beyond i_size is simple: memset to zero */ - zero_user_segments(page, 0, start, end, PAGE_CACHE_SIZE); - goto out_update; - } - - if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + if (blkaddr == NEW_ADDR) { + zero_user_segment(page, 0, PAGE_SIZE); + SetPageUptodate(page); } else { - struct f2fs_io_info fio = { - .sbi = sbi, - .type = DATA, - .rw = READ_SYNC, - .blk_addr = dn.data_blkaddr, - .page = page, - .encrypted_page = NULL, - }; - err = f2fs_submit_page_bio(&fio); - if (err) - goto fail; + struct bio *bio; - lock_page(page); - if (unlikely(!PageUptodate(page))) { - err = -EIO; + bio = f2fs_grab_bio(inode, blkaddr, 1); + if (IS_ERR(bio)) { + err = PTR_ERR(bio); goto fail; } + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + err = -EFAULT; + goto fail; + } + + __submit_bio(sbi, READ_SYNC, bio, DATA); + + lock_page(page); if (unlikely(page->mapping != mapping)) { f2fs_put_page(page, 1); goto repeat; } - - /* avoid symlink page */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - err = f2fs_decrypt_one(inode, page); - if (err) - goto fail; + if (unlikely(!PageUptodate(page))) { + err = -EIO; + goto fail; } } -out_update: - SetPageUptodate(page); -out_clear: - clear_cold_data(page); return 0; -put_fail: - f2fs_put_dnode(&dn); -unlock_fail: - f2fs_unlock_op(sbi); fail: f2fs_put_page(page, 1); f2fs_write_failed(mapping, pos + len); @@ -1543,15 +1702,28 @@ static int f2fs_write_end(struct file *file, trace_android_fs_datawrite_end(inode, pos, len); trace_f2fs_write_end(inode, pos, len, copied); - set_page_dirty(page); - - if (pos + copied > i_size_read(inode)) { - i_size_write(inode, pos + copied); - mark_inode_dirty(inode); - update_inode_page(inode); + /* + * This should be come from len == PAGE_SIZE, and we expect copied + * should be PAGE_SIZE. Otherwise, we treat it with zero copied and + * let generic_perform_write() try to copy data again through copied=0. + */ + if (!PageUptodate(page)) { + if (unlikely(copied != PAGE_SIZE)) + copied = 0; + else + SetPageUptodate(page); } + if (!copied) + goto unlock_out; + set_page_dirty(page); + clear_cold_data(page); + + if (pos + copied > i_size_read(inode)) + f2fs_i_size_write(inode, pos + copied); +unlock_out: f2fs_put_page(page, 1); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return copied; } @@ -1570,29 +1742,22 @@ static int check_direct_IO(struct inode *inode, struct iov_iter *iter, } static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, - loff_t offset) + loff_t offset) { - struct file *file = iocb->ki_filp; - struct address_space *mapping = file->f_mapping; + struct address_space *mapping = iocb->ki_filp->f_mapping; struct inode *inode = mapping->host; size_t count = iov_iter_count(iter); + int rw = iov_iter_rw(iter); int err; - /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - err = check_direct_IO(inode, iter, offset); if (err) return err; - trace_f2fs_direct_IO_enter(inode, offset, count, iov_iter_rw(iter)); + if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + return 0; + if (test_opt(F2FS_I_SB(inode), LFS)) + return 0; if (trace_android_fs_dataread_start_enabled() && (iov_iter_rw(iter) == READ)) { @@ -1616,18 +1781,18 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, current->pid, path, current->comm); } - if (iov_iter_rw(iter) == WRITE) { - __allocate_data_blocks(inode, offset, count); - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) { - err = -EIO; - goto out; - } - } + trace_f2fs_direct_IO_enter(inode, offset, count, rw); + down_read(&F2FS_I(inode)->dio_rwsem[rw]); err = blockdev_direct_IO(iocb, inode, iter, offset, get_data_block_dio); -out: - if (err < 0 && iov_iter_rw(iter) == WRITE) - f2fs_write_failed(mapping, offset + count); + up_read(&F2FS_I(inode)->dio_rwsem[rw]); + + if (rw == WRITE) { + if (err > 0) + set_inode_flag(inode, FI_UPDATE_WRITE); + else if (err < 0) + f2fs_write_failed(mapping, offset + count); + } if (trace_android_fs_dataread_start_enabled() && (iov_iter_rw(iter) == READ)) @@ -1636,7 +1801,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, (iov_iter_rw(iter) == WRITE)) trace_android_fs_datawrite_end(inode, offset, count); - trace_f2fs_direct_IO_exit(inode, offset, count, iov_iter_rw(iter), err); + trace_f2fs_direct_IO_exit(inode, offset, count, rw, err); return err; } @@ -1648,7 +1813,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, struct f2fs_sb_info *sbi = F2FS_I_SB(inode); if (inode->i_ino >= F2FS_ROOT_INO(sbi) && - (offset % PAGE_CACHE_SIZE || length != PAGE_CACHE_SIZE)) + (offset % PAGE_SIZE || length != PAGE_SIZE)) return; if (PageDirty(page)) { @@ -1664,6 +1829,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, if (IS_ATOMIC_WRITTEN_PAGE(page)) return; + set_page_private(page, 0); ClearPagePrivate(page); } @@ -1677,10 +1843,42 @@ int f2fs_release_page(struct page *page, gfp_t wait) if (IS_ATOMIC_WRITTEN_PAGE(page)) return 0; + set_page_private(page, 0); ClearPagePrivate(page); return 1; } +/* + * This was copied from __set_page_dirty_buffers which gives higher performance + * in very high speed storages. (e.g., pmem) + */ +void f2fs_set_page_dirty_nobuffers(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct mem_cgroup *memcg; + unsigned long flags; + + if (unlikely(!mapping)) + return; + + spin_lock(&mapping->private_lock); + memcg = mem_cgroup_begin_page_stat(page); + SetPageDirty(page); + spin_unlock(&mapping->private_lock); + + spin_lock_irqsave(&mapping->tree_lock, flags); + WARN_ON_ONCE(!PageUptodate(page)); + account_page_dirtied(page, mapping, memcg); + radix_tree_tag_set(&mapping->page_tree, + page_index(page), PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + + mem_cgroup_end_page_stat(memcg); + + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + return; +} + static int f2fs_set_data_page_dirty(struct page *page) { struct address_space *mapping = page->mapping; @@ -1688,7 +1886,8 @@ static int f2fs_set_data_page_dirty(struct page *page) trace_f2fs_set_page_dirty(page, DATA); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (f2fs_is_atomic_file(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { @@ -1703,7 +1902,7 @@ static int f2fs_set_data_page_dirty(struct page *page) } if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); update_dirty_page(inode, page); return 1; } @@ -1724,6 +1923,58 @@ static sector_t f2fs_bmap(struct address_space *mapping, sector_t block) return generic_block_bmap(mapping, block, get_data_block_bmap); } +#ifdef CONFIG_MIGRATION +#include + +int f2fs_migrate_page(struct address_space *mapping, + struct page *newpage, struct page *page, enum migrate_mode mode) +{ + int rc, extra_count; + struct f2fs_inode_info *fi = F2FS_I(mapping->host); + bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page); + + BUG_ON(PageWriteback(page)); + + /* migrating an atomic written page is safe with the inmem_lock hold */ + if (atomic_written && !mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + + /* + * A reference is expected if PagePrivate set when move mapping, + * however F2FS breaks this for maintaining dirty page counts when + * truncating pages. So here adjusting the 'extra_count' make it work. + */ + extra_count = (atomic_written ? 1 : 0) - page_has_private(page); + rc = migrate_page_move_mapping(mapping, newpage, + page, NULL, mode, extra_count); + if (rc != MIGRATEPAGE_SUCCESS) { + if (atomic_written) + mutex_unlock(&fi->inmem_lock); + return rc; + } + + if (atomic_written) { + struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) + if (cur->page == page) { + cur->page = newpage; + break; + } + mutex_unlock(&fi->inmem_lock); + put_page(page); + get_page(newpage); + } + + if (PagePrivate(page)) + SetPagePrivate(newpage); + set_page_private(newpage, page_private(page)); + + migrate_page_copy(newpage, page); + + return MIGRATEPAGE_SUCCESS; +} +#endif + const struct address_space_operations f2fs_dblock_aops = { .readpage = f2fs_read_data_page, .readpages = f2fs_read_data_pages, @@ -1736,4 +1987,7 @@ const struct address_space_operations f2fs_dblock_aops = { .releasepage = f2fs_release_page, .direct_IO = f2fs_direct_IO, .bmap = f2fs_bmap, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 24d6a51b48d1..1c35e80732e0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -38,23 +38,30 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->hit_rbtree = atomic64_read(&sbi->read_hit_rbtree); si->hit_total = si->hit_largest + si->hit_cached + si->hit_rbtree; si->total_ext = atomic64_read(&sbi->total_hit_ext); - si->ext_tree = sbi->total_ext_tree; + si->ext_tree = atomic_read(&sbi->total_ext_tree); + si->zombie_tree = atomic_read(&sbi->total_zombie_tree); si->ext_node = atomic_read(&sbi->total_ext_node); si->ndirty_node = get_pages(sbi, F2FS_DIRTY_NODES); si->ndirty_dent = get_pages(sbi, F2FS_DIRTY_DENTS); - si->ndirty_dirs = sbi->n_dirty_dirs; si->ndirty_meta = get_pages(sbi, F2FS_DIRTY_META); + si->ndirty_data = get_pages(sbi, F2FS_DIRTY_DATA); + si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); + si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; + si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_pages = get_pages(sbi, F2FS_WRITEBACK); + si->wb_bios = atomic_read(&sbi->nr_wb_bios); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); si->valid_count = valid_user_blocks(sbi); + si->discard_blks = discard_blocks(sbi); si->valid_node_count = valid_node_count(sbi); si->valid_inode_count = valid_inode_count(sbi); si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); si->free_segs = free_segments(sbi); @@ -105,7 +112,7 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * (1 << sbi->log_blocks_per_seg); + blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); @@ -140,6 +147,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); + si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; /* build sm */ si->base_mem += sizeof(struct f2fs_sm_info); @@ -148,7 +156,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->base_mem += sizeof(struct sit_info); si->base_mem += MAIN_SEGS(sbi) * sizeof(struct seg_entry); si->base_mem += f2fs_bitmap_size(MAIN_SEGS(sbi)); - si->base_mem += 3 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + si->base_mem += 2 * SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); + if (f2fs_discard_en(sbi)) + si->base_mem += SIT_VBLOCK_MAP_SIZE * MAIN_SEGS(sbi); si->base_mem += SIT_VBLOCK_MAP_SIZE; if (sbi->segs_per_sec > 1) si->base_mem += MAIN_SECS(sbi) * sizeof(struct sec_entry); @@ -161,7 +171,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build curseg */ si->base_mem += sizeof(struct curseg_info) * NR_CURSEG_TYPE; - si->base_mem += PAGE_CACHE_SIZE * NR_CURSEG_TYPE; + si->base_mem += PAGE_SIZE * NR_CURSEG_TYPE; /* build dirty segmap */ si->base_mem += sizeof(struct dirty_seglist_info); @@ -189,18 +199,18 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); si->cache_mem += si->inmem_pages * sizeof(struct inmem_pages); - si->cache_mem += sbi->n_dirty_dirs * sizeof(struct inode_entry); - for (i = 0; i <= UPDATE_INO; i++) + for (i = 0; i <= ORPHAN_INO; i++) si->cache_mem += sbi->im[i].ino_num * sizeof(struct ino_entry); - si->cache_mem += sbi->total_ext_tree * sizeof(struct extent_tree); + si->cache_mem += atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree); si->cache_mem += atomic_read(&sbi->total_ext_node) * sizeof(struct extent_node); si->page_mem = 0; npages = NODE_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; npages = META_MAPPING(sbi)->nrpages; - si->page_mem += (unsigned long long)npages << PAGE_CACHE_SHIFT; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } static int stat_show(struct seq_file *s, void *v) @@ -211,20 +221,24 @@ static int stat_show(struct seq_file *s, void *v) mutex_lock(&f2fs_stat_mutex); list_for_each_entry(si, &f2fs_stat_list, stat_list) { - char devname[BDEVNAME_SIZE]; - update_general_status(si->sbi); - seq_printf(s, "\n=====[ partition info(%s). #%d ]=====\n", - bdevname(si->sbi->sb->s_bdev, devname), i++); + seq_printf(s, "\n=====[ partition info(%pg). #%d, %s]=====\n", + si->sbi->sb->s_bdev, i++, + f2fs_readonly(si->sbi->sb) ? "RO": "RW"); seq_printf(s, "[SB: 1] [CP: 2] [SIT: %d] [NAT: %d] ", si->sit_area_segs, si->nat_area_segs); seq_printf(s, "[SSA: %d] [MAIN: %d", si->ssa_area_segs, si->main_area_segs); seq_printf(s, "(OverProv:%d Resv:%d)]\n\n", si->overp_segs, si->rsvd_segs); - seq_printf(s, "Utilization: %d%% (%d valid blocks)\n", - si->utilization, si->valid_count); + if (test_opt(si->sbi, DISCARD)) + seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n", + si->utilization, si->valid_count, si->discard_blks); + else + seq_printf(s, "Utilization: %u%% (%u valid blocks)\n", + si->utilization, si->valid_count); + seq_printf(s, " - Node: %u (Inode: %u, ", si->valid_node_count, si->valid_inode_count); seq_printf(s, "Other: %u)\n - Data: %u\n", @@ -236,6 +250,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); + seq_printf(s, " - Orphan Inode: %u\n", + si->orphans); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -269,7 +285,8 @@ static int stat_show(struct seq_file *s, void *v) si->dirty_count); seq_printf(s, " - Prefree: %d\n - Free: %d (%d)\n\n", si->prefree_count, si->free_segs, si->free_secs); - seq_printf(s, "CP calls: %d\n", si->cp_count); + seq_printf(s, "CP calls: %d (BG: %d)\n", + si->cp_count, si->bg_cp_count); seq_printf(s, "GC calls: %d (BG: %d)\n", si->call_count, si->bg_gc); seq_printf(s, " - data segments : %d (%d)\n", @@ -290,17 +307,21 @@ static int stat_show(struct seq_file *s, void *v) !si->total_ext ? 0 : div64_u64(si->hit_total * 100, si->total_ext), si->hit_total, si->total_ext); - seq_printf(s, " - Inner Struct Count: tree: %d, node: %d\n", - si->ext_tree, si->ext_node); + seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", + si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb: %4d\n", - si->inmem_pages, si->wb_pages); - seq_printf(s, " - nodes: %4d in %4d\n", + seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + si->inmem_pages, si->wb_bios); + seq_printf(s, " - nodes: %4lld in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4d in dirs:%4d\n", - si->ndirty_dent, si->ndirty_dirs); - seq_printf(s, " - meta: %4d in %4d\n", + seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); + seq_printf(s, " - datas: %4lld in files:%4d\n", + si->ndirty_data, si->ndirty_files); + seq_printf(s, " - meta: %4lld in %4d\n", si->ndirty_meta, si->meta_pages); + seq_printf(s, " - imeta: %4lld\n", + si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); seq_printf(s, " - free_nids: %9d\n", @@ -407,20 +428,23 @@ void f2fs_destroy_stats(struct f2fs_sb_info *sbi) kfree(si); } -void __init f2fs_create_root_stats(void) +int __init f2fs_create_root_stats(void) { struct dentry *file; f2fs_debugfs_root = debugfs_create_dir("f2fs", NULL); if (!f2fs_debugfs_root) - return; + return -ENOMEM; file = debugfs_create_file("status", S_IRUGO, f2fs_debugfs_root, NULL, &stat_fops); if (!file) { debugfs_remove(f2fs_debugfs_root); f2fs_debugfs_root = NULL; + return -ENOMEM; } + + return 0; } void f2fs_destroy_root_stats(void) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 60972a559685..e634a637c443 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -17,8 +17,8 @@ static unsigned long dir_blocks(struct inode *inode) { - return ((unsigned long long) (i_size_read(inode) + PAGE_CACHE_SIZE - 1)) - >> PAGE_CACHE_SHIFT; + return ((unsigned long long) (i_size_read(inode) + PAGE_SIZE - 1)) + >> PAGE_SHIFT; } static unsigned int dir_buckets(unsigned int level, int dir_level) @@ -37,7 +37,7 @@ static unsigned int bucket_blocks(unsigned int level) return 4; } -unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { +static unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_UNKNOWN] = DT_UNKNOWN, [F2FS_FT_REG_FILE] = DT_REG, [F2FS_FT_DIR] = DT_DIR, @@ -48,7 +48,6 @@ unsigned char f2fs_filetype_table[F2FS_FT_MAX] = { [F2FS_FT_SYMLINK] = DT_LNK, }; -#define S_SHIFT 12 static unsigned char f2fs_type_by_mode[S_IFMT >> S_SHIFT] = { [S_IFREG >> S_SHIFT] = F2FS_FT_REG_FILE, [S_IFDIR >> S_SHIFT] = F2FS_FT_DIR, @@ -64,6 +63,13 @@ void set_de_type(struct f2fs_dir_entry *de, umode_t mode) de->file_type = f2fs_type_by_mode[(mode & S_IFMT) >> S_SHIFT]; } +unsigned char get_de_type(struct f2fs_dir_entry *de) +{ + if (de->file_type < F2FS_FT_MAX) + return f2fs_filetype_table[de->file_type]; + return DT_UNKNOWN; +} + static unsigned long dir_block_index(unsigned int level, int dir_level, unsigned int idx) { @@ -77,7 +83,7 @@ static unsigned long dir_block_index(unsigned int level, } static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, - struct f2fs_filename *fname, + struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct page **res_page) @@ -95,23 +101,18 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, else kunmap(dentry_page); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(F2FS_P_SB(dentry_page), d.max < 0); return de; } -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, f2fs_hash_t namehash, int *max_slots, struct f2fs_dentry_ptr *d) { struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); - struct f2fs_str *name = &fname->disk_name; + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -124,37 +125,28 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, de = &d->dentry[bit_pos]; - if (de->hash_code != namehash) - goto not_match; + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + /* encrypted case */ de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (unlikely(!name->name)) { - if (fname->usr_fname->name[0] == '_') { - if (de_name.len > 32 && - !memcmp(de_name.name + ((de_name.len - 17) & ~15), - fname->crypto_buf.name + 8, 16)) - goto found; - goto not_match; - } - name->name = fname->crypto_buf.name; - name->len = fname->crypto_buf.len; - } -#endif - if (de_name.len == name->len && - !memcmp(de_name.name, name->name, name->len)) + /* show encrypted name */ + if (fname->hash) { + if (de->hash_code == fname->hash) + goto found; + } else if (de_name.len == name->len && + de->hash_code == namehash && + !memcmp(de_name.name, name->name, name->len)) goto found; -not_match: + if (max_slots && max_len > *max_slots) *max_slots = max_len; max_len = 0; - /* remain bug on condition */ - if (unlikely(!de->name_len)) - d->max = -1; - bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); } @@ -167,7 +159,7 @@ struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *fname, static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, - struct f2fs_filename *fname, + struct fscrypt_name *fname, struct page **res_page) { struct qstr name = FSTR_TO_QSTR(&fname->disk_name); @@ -180,9 +172,10 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, int max_slots; f2fs_hash_t namehash; - namehash = f2fs_dentry_hash(&name, fname); - - f2fs_bug_on(F2FS_I_SB(dir), level > MAX_DIR_HASH_DEPTH); + if(fname->hash) + namehash = cpu_to_le32(fname->hash); + else + namehash = f2fs_dentry_hash(&name); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -195,8 +188,13 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, /* no need to allocate new dentry pages to all the indices */ dentry_page = find_data_page(dir, bidx); if (IS_ERR(dentry_page)) { - room = true; - continue; + if (PTR_ERR(dentry_page) == -ENOENT) { + room = true; + continue; + } else { + *res_page = dentry_page; + break; + } } de = find_in_block(dentry_page, fname, namehash, &max_slots, @@ -217,6 +215,44 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, return de; } +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page) +{ + unsigned long npages = dir_blocks(dir); + struct f2fs_dir_entry *de = NULL; + unsigned int max_depth; + unsigned int level; + + if (f2fs_has_inline_dentry(dir)) { + *res_page = NULL; + de = find_in_inline_dir(dir, fname, res_page); + goto out; + } + + if (npages == 0) { + *res_page = NULL; + goto out; + } + + max_depth = F2FS_I(dir)->i_current_depth; + if (unlikely(max_depth > MAX_DIR_HASH_DEPTH)) { + f2fs_msg(F2FS_I_SB(dir)->sb, KERN_WARNING, + "Corrupted max_depth of %lu: %u", + dir->i_ino, max_depth); + max_depth = MAX_DIR_HASH_DEPTH; + f2fs_i_depth_write(dir, max_depth); + } + + for (level = 0; level < max_depth; level++) { + *res_page = NULL; + de = find_in_level(dir, level, fname, res_page); + if (de || IS_ERR(*res_page)) + break; + } +out: + return de; +} + /* * Find an entry in the specified directory with the wanted name. * It returns the page where the entry was found (as a parameter - res_page), @@ -224,72 +260,42 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, * Entry is guaranteed to be valid. */ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - struct qstr *child, struct page **res_page) + const struct qstr *child, struct page **res_page) { - unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; - unsigned int max_depth; - unsigned int level; - struct f2fs_filename fname; + struct fscrypt_name fname; int err; - *res_page = NULL; - - err = f2fs_fname_setup_filename(dir, child, 1, &fname); - if (err) + err = fscrypt_setup_filename(dir, child, 1, &fname); + if (err) { + *res_page = ERR_PTR(err); return NULL; - - if (f2fs_has_inline_dentry(dir)) { - de = find_in_inline_dir(dir, &fname, res_page); - goto out; } - if (npages == 0) - goto out; + de = __f2fs_find_entry(dir, &fname, res_page); - max_depth = F2FS_I(dir)->i_current_depth; - - for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, &fname, res_page); - if (de) - break; - } -out: - f2fs_fname_free_filename(&fname); + fscrypt_free_filename(&fname); return de; } struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct page *page; - struct f2fs_dir_entry *de; - struct f2fs_dentry_block *dentry_blk; + struct qstr dotdot = QSTR_INIT("..", 2); - if (f2fs_has_inline_dentry(dir)) - return f2fs_parent_inline_dir(dir, p); - - page = get_lock_data_page(dir, 0, false); - if (IS_ERR(page)) - return NULL; - - dentry_blk = kmap(page); - de = &dentry_blk->dentry[1]; - *p = page; - unlock_page(page); - return de; + return f2fs_find_entry(dir, &dotdot, p); } -ino_t f2fs_inode_by_name(struct inode *dir, struct qstr *qstr) +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page) { ino_t res = 0; struct f2fs_dir_entry *de; - struct page *page; - de = f2fs_find_entry(dir, qstr, &page); + de = f2fs_find_entry(dir, qstr, page); if (de) { res = le32_to_cpu(de->ino); - f2fs_dentry_kunmap(dir, page); - f2fs_put_page(page, 0); + f2fs_dentry_kunmap(dir, *page); + f2fs_put_page(*page, 0); } return res; @@ -300,14 +306,14 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; lock_page(page); - f2fs_wait_on_page_writeback(page, type); + f2fs_wait_on_page_writeback(page, type, true); de->ino = cpu_to_le32(inode->i_ino); set_de_type(de, inode->i_mode); f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + dir->i_mtime = dir->i_ctime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -315,7 +321,7 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); /* copy name info. to this inode page */ ri = F2FS_INODE(ipage); @@ -345,24 +351,14 @@ int update_dent_inode(struct inode *inode, struct inode *to, void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { - struct f2fs_dir_entry *de; + struct qstr dot = QSTR_INIT(".", 1); + struct qstr dotdot = QSTR_INIT("..", 2); - de = &d->dentry[0]; - de->name_len = cpu_to_le16(1); - de->hash_code = 0; - de->ino = cpu_to_le32(inode->i_ino); - memcpy(d->filename[0], ".", 1); - set_de_type(de, inode->i_mode); + /* update dirent of "." */ + f2fs_update_dentry(inode->i_ino, inode->i_mode, d, &dot, 0, 0); - de = &d->dentry[1]; - de->hash_code = 0; - de->name_len = cpu_to_le16(2); - de->ino = cpu_to_le32(parent->i_ino); - memcpy(d->filename[1], "..", 2); - set_de_type(de, parent->i_mode); - - test_and_set_bit_le(0, (void *)d->bitmap); - test_and_set_bit_le(1, (void *)d->bitmap); + /* update dirent of ".." */ + f2fs_update_dentry(parent->i_ino, parent->i_mode, d, &dotdot, 0, 1); } static int make_empty_dir(struct inode *inode, @@ -392,32 +388,38 @@ static int make_empty_dir(struct inode *inode, } struct page *init_inode_metadata(struct inode *inode, struct inode *dir, - const struct qstr *name, struct page *dpage) + const struct qstr *new_name, const struct qstr *orig_name, + struct page *dpage) { struct page *page; int err; - if (is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { + if (is_inode_flag_set(inode, FI_NEW_INODE)) { page = new_inode_page(inode); if (IS_ERR(page)) return page; if (S_ISDIR(inode->i_mode)) { + /* in order to handle error case */ + get_page(page); err = make_empty_dir(inode, dir, page); - if (err) - goto error; + if (err) { + lock_page(page); + goto put_error; + } + put_page(page); } err = f2fs_init_acl(inode, dir, page, dpage); if (err) goto put_error; - err = f2fs_init_security(inode, dir, name, page); + err = f2fs_init_security(inode, dir, orig_name, page); if (err) goto put_error; if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) { - err = f2fs_inherit_context(dir, inode, page); + err = fscrypt_inherit_context(dir, inode, page, false); if (err) goto put_error; } @@ -429,14 +431,14 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (name) - init_dent_inode(name, page); + if (new_name) + init_dent_inode(new_name, page); /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ - if (is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) { + if (is_inode_flag_set(inode, FI_INC_LINK)) { file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, @@ -444,41 +446,33 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, */ if (inode->i_nlink == 0) remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); - inc_nlink(inode); + f2fs_i_links_write(inode, true); } return page; put_error: + clear_nlink(inode); + update_inode(inode, page); f2fs_put_page(page, 1); -error: - /* once the failed inode becomes a bad inode, i_mode is S_IFREG */ - truncate_inode_pages(&inode->i_data, 0); - truncate_blocks(inode, 0, false); - remove_dirty_dir_inode(inode); - remove_inode_page(inode); return ERR_PTR(err); } void update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth) { - if (inode && is_inode_flag_set(F2FS_I(inode), FI_NEW_INODE)) { - if (S_ISDIR(inode->i_mode)) { - inc_nlink(dir); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + if (inode && is_inode_flag_set(inode, FI_NEW_INODE)) { + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, true); + clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = CURRENT_TIME; - mark_inode_dirty(dir); + f2fs_mark_inode_dirty_sync(dir); - if (F2FS_I(dir)->i_current_depth != current_depth) { - F2FS_I(dir)->i_current_depth = current_depth; - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } + if (F2FS_I(dir)->i_current_depth != current_depth) + f2fs_i_depth_write(dir, current_depth); - if (inode && is_inode_flag_set(F2FS_I(inode), FI_INC_LINK)) - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + if (inode && is_inode_flag_set(inode, FI_INC_LINK)) + clear_inode_flag(inode, FI_INC_LINK); } int room_for_filename(const void *bitmap, int slots, int max_slots) @@ -515,15 +509,16 @@ void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, memcpy(d->filename[bit_pos], name->name, name->len); de->ino = cpu_to_le32(ino); set_de_type(de, mode); - for (i = 0; i < slots; i++) - test_and_set_bit_le(bit_pos + i, (void *)d->bitmap); + for (i = 0; i < slots; i++) { + __set_bit_le(bit_pos + i, (void *)d->bitmap); + /* avoid wrong garbage data for readdir */ + if (i) + (de + i)->name_len = 0; + } } -/* - * Caller should grab and release a rwsem by calling f2fs_lock_op() and - * f2fs_unlock_op(). - */ -int __f2fs_add_link(struct inode *dir, const struct qstr *name, +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, struct inode *inode, nid_t ino, umode_t mode) { unsigned int bit_pos; @@ -536,28 +531,11 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; struct page *page = NULL; - struct f2fs_filename fname; - struct qstr new_name; - int slots, err; - - err = f2fs_fname_setup_filename(dir, name, 0, &fname); - if (err) - return err; - - new_name.name = fname_name(&fname); - new_name.len = fname_len(&fname); - - if (f2fs_has_inline_dentry(dir)) { - err = f2fs_add_inline_entry(dir, &new_name, inode, ino, mode); - if (!err || err != -EAGAIN) - goto out; - else - err = 0; - } + int slots, err = 0; level = 0; - slots = GET_DENTRY_SLOTS(new_name.len); - dentry_hash = f2fs_dentry_hash(&new_name, NULL); + slots = GET_DENTRY_SLOTS(new_name->len); + dentry_hash = f2fs_dentry_hash(new_name); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -566,10 +544,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } start: - if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) { - err = -ENOSPC; - goto out; - } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + return -ENOSPC; +#endif + if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) + return -ENOSPC; /* Increase the depth, if required */ if (level == current_depth) @@ -583,10 +563,8 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, for (block = bidx; block <= (bidx + nblock - 1); block++) { dentry_page = get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); - goto out; - } + if (IS_ERR(dentry_page)) + return PTR_ERR(dentry_page); dentry_blk = kmap(dentry_page); bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, @@ -602,11 +580,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA); + f2fs_wait_on_page_writeback(dentry_page, DATA, true); if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, &new_name, NULL); + page = init_inode_metadata(inode, dir, new_name, + orig_name, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; @@ -616,14 +595,12 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, } make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); - f2fs_update_dentry(ino, mode, &d, &new_name, dentry_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); if (inode) { - /* we don't need to mark_inode_dirty now */ - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -632,14 +609,49 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, if (inode) up_write(&F2FS_I(inode)->i_sem); - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode_page(dir); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } kunmap(dentry_page); f2fs_put_page(dentry_page, 1); -out: - f2fs_fname_free_filename(&fname); + + return err; +} + +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct qstr new_name; + int err = -EAGAIN; + + new_name.name = fname_name(fname); + new_name.len = fname_len(fname); + + if (f2fs_has_inline_dentry(dir)) + err = f2fs_add_inline_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + if (err == -EAGAIN) + err = f2fs_add_regular_entry(dir, &new_name, fname->usr_fname, + inode, ino, mode); + + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + return err; +} + +/* + * Caller should grab and release a rwsem by calling f2fs_lock_op() and + * f2fs_unlock_op(). + */ +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode) +{ + struct fscrypt_name fname; + int err; + + err = fscrypt_setup_filename(dir, name, 0, &fname); + if (err) + return err; + + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + + fscrypt_free_filename(&fname); return err; } @@ -649,46 +661,39 @@ int f2fs_do_tmpfile(struct inode *inode, struct inode *dir) int err = 0; down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, NULL, NULL); + page = init_inode_metadata(inode, dir, NULL, NULL, NULL); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } - /* we don't need to mark_inode_dirty now */ - update_inode(inode, page); f2fs_put_page(page, 1); - clear_inode_flag(F2FS_I(inode), FI_NEW_INODE); + clear_inode_flag(inode, FI_NEW_INODE); fail: up_write(&F2FS_I(inode)->i_sem); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return err; } -void f2fs_drop_nlink(struct inode *dir, struct inode *inode, struct page *page) +void f2fs_drop_nlink(struct inode *dir, struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); down_write(&F2FS_I(inode)->i_sem); - if (S_ISDIR(inode->i_mode)) { - drop_nlink(dir); - if (page) - update_inode(dir, page); - else - update_inode_page(dir); - } + if (S_ISDIR(inode->i_mode)) + f2fs_i_links_write(dir, false); inode->i_ctime = CURRENT_TIME; - drop_nlink(inode); + f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { - drop_nlink(inode); - i_size_write(inode, 0); + f2fs_i_links_write(inode, false); + f2fs_i_size_write(inode, 0); } up_write(&F2FS_I(inode)->i_sem); - update_inode_page(inode); if (inode->i_nlink == 0) - add_orphan_inode(sbi, inode->i_ino); + add_orphan_inode(inode); else release_orphan_inode(sbi); } @@ -705,11 +710,13 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); int i; + f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); + if (f2fs_has_inline_dentry(dir)) return f2fs_delete_inline_entry(dentry, page, dir, inode); lock_page(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; @@ -724,9 +731,10 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, NULL); + f2fs_drop_nlink(dir, inode); if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { @@ -777,12 +785,12 @@ bool f2fs_empty_dir(struct inode *dir) } bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, - unsigned int start_pos, struct f2fs_str *fstr) + unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; unsigned int bit_pos; struct f2fs_dir_entry *de = NULL; - struct f2fs_str de_name = FSTR_INIT(NULL, 0); + struct fscrypt_str de_name = FSTR_INIT(NULL, 0); bit_pos = ((unsigned long)ctx->pos % d->max); @@ -792,10 +800,13 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, break; de = &d->dentry[bit_pos]; - if (de->file_type < F2FS_FT_MAX) - d_type = f2fs_filetype_table[de->file_type]; - else - d_type = DT_UNKNOWN; + if (de->name_len == 0) { + bit_pos++; + ctx->pos = start_pos + bit_pos; + continue; + } + + d_type = get_de_type(de); de_name.name = d->filename[bit_pos]; de_name.len = le16_to_cpu(de->name_len); @@ -804,15 +815,9 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, int save_len = fstr->len; int ret; - de_name.name = kmalloc(de_name.len, GFP_NOFS); - if (!de_name.name) - return false; - - memcpy(de_name.name, d->filename[bit_pos], de_name.len); - - ret = f2fs_fname_disk_to_usr(d->inode, &de->hash_code, - &de_name, fstr); - kfree(de_name.name); + ret = fscrypt_fname_disk_to_usr(d->inode, + (u32)de->hash_code, 0, + &de_name, fstr); if (ret < 0) return true; @@ -839,16 +844,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct file_ra_state *ra = &file->f_ra; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); struct f2fs_dentry_ptr d; - struct f2fs_str fstr = FSTR_INIT(NULL, 0); + struct fscrypt_str fstr = FSTR_INIT(NULL, 0); int err = 0; if (f2fs_encrypted_inode(inode)) { - err = f2fs_get_encryption_info(inode); - if (err) + err = fscrypt_get_encryption_info(inode); + if (err && err != -ENOKEY) return err; - err = f2fs_fname_crypto_alloc_buffer(inode, F2FS_NAME_LEN, - &fstr); + err = fscrypt_fname_alloc_buffer(inode, F2FS_NAME_LEN, &fstr); if (err < 0) return err; } @@ -865,36 +869,47 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) for (; n < npages; n++) { dentry_page = get_lock_data_page(inode, n, false); - if (IS_ERR(dentry_page)) - continue; + if (IS_ERR(dentry_page)) { + err = PTR_ERR(dentry_page); + if (err == -ENOENT) + continue; + else + goto out; + } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) - goto stop; + if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + kunmap(dentry_page); + f2fs_put_page(dentry_page, 1); + break; + } ctx->pos = (n + 1) * NR_DENTRY_IN_BLOCK; kunmap(dentry_page); f2fs_put_page(dentry_page, 1); - dentry_page = NULL; - } -stop: - if (dentry_page && !IS_ERR(dentry_page)) { - kunmap(dentry_page); - f2fs_put_page(dentry_page, 1); } + err = 0; out: - f2fs_fname_crypto_free_buffer(&fstr); + fscrypt_fname_free_buffer(&fstr); return err; } +static int f2fs_dir_open(struct inode *inode, struct file *filp) +{ + if (f2fs_encrypted_inode(inode)) + return fscrypt_get_encryption_info(inode) ? -EACCES : 0; + return 0; +} + const struct file_operations f2fs_dir_operations = { .llseek = generic_file_llseek, .read = generic_read_dir, .iterate = f2fs_readdir, .fsync = f2fs_sync_file, + .open = f2fs_dir_open, .unlocked_ioctl = f2fs_ioctl, #ifdef CONFIG_COMPAT .compat_ioctl = f2fs_compat_ioctl, diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 7ddba812e11b..2b06d4fcd954 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -33,10 +33,11 @@ static struct extent_node *__attach_extent_node(struct f2fs_sb_info *sbi, en->ei = *ei; INIT_LIST_HEAD(&en->list); + en->et = et; rb_link_node(&en->rb_node, parent, p); rb_insert_color(&en->rb_node, &et->root); - et->count++; + atomic_inc(&et->node_cnt); atomic_inc(&sbi->total_ext_node); return en; } @@ -45,11 +46,29 @@ static void __detach_extent_node(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_node *en) { rb_erase(&en->rb_node, &et->root); - et->count--; + atomic_dec(&et->node_cnt); atomic_dec(&sbi->total_ext_node); if (et->cached_en == en) et->cached_en = NULL; + kmem_cache_free(extent_node_slab, en); +} + +/* + * Flow to release an extent_node: + * 1. list_del_init + * 2. __detach_extent_node + * 3. kmem_cache_free. + */ +static void __release_extent_node(struct f2fs_sb_info *sbi, + struct extent_tree *et, struct extent_node *en) +{ + spin_lock(&sbi->extent_lock); + f2fs_bug_on(sbi, list_empty(&en->list)); + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); } static struct extent_tree *__grab_extent_tree(struct inode *inode) @@ -68,11 +87,13 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) et->root = RB_ROOT; et->cached_en = NULL; rwlock_init(&et->lock); - atomic_set(&et->refcount, 0); - et->count = 0; - sbi->total_ext_tree++; + INIT_LIST_HEAD(&et->list); + atomic_set(&et->node_cnt, 0); + atomic_inc(&sbi->total_ext_tree); + } else { + atomic_dec(&sbi->total_zombie_tree); + list_del_init(&et->list); } - atomic_inc(&et->refcount); up_write(&sbi->extent_tree_lock); /* never died until evict_inode */ @@ -127,32 +148,21 @@ static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, } static unsigned int __free_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, bool free_all) + struct extent_tree *et) { struct rb_node *node, *next; struct extent_node *en; - unsigned int count = et->count; + unsigned int count = atomic_read(&et->node_cnt); node = rb_first(&et->root); while (node) { next = rb_next(node); en = rb_entry(node, struct extent_node, rb_node); - - if (free_all) { - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) - list_del_init(&en->list); - spin_unlock(&sbi->extent_lock); - } - - if (free_all || list_empty(&en->list)) { - __detach_extent_node(sbi, et, en); - kmem_cache_free(extent_node_slab, en); - } + __release_extent_node(sbi, et, en); node = next; } - return count - et->count; + return count - atomic_read(&et->node_cnt); } static void __drop_largest_extent(struct inode *inode, @@ -160,38 +170,38 @@ static void __drop_largest_extent(struct inode *inode, { struct extent_info *largest = &F2FS_I(inode)->extent_tree->largest; - if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) + if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; + f2fs_mark_inode_dirty_sync(inode); + } } -void f2fs_drop_largest_extent(struct inode *inode, pgoff_t fofs) -{ - if (!f2fs_may_extent_tree(inode)) - return; - - __drop_largest_extent(inode, fofs, 1); -} - -void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +/* return true, if inode page is changed */ +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; struct extent_node *en; struct extent_info ei; - if (!f2fs_may_extent_tree(inode)) - return; + if (!f2fs_may_extent_tree(inode)) { + /* drop largest extent */ + if (i_ext && i_ext->len) { + i_ext->len = 0; + return true; + } + return false; + } et = __grab_extent_tree(inode); - if (!i_ext || le32_to_cpu(i_ext->len) < F2FS_MIN_EXTENT_LEN) - return; + if (!i_ext || !i_ext->len) + return false; - set_extent_info(&ei, le32_to_cpu(i_ext->fofs), - le32_to_cpu(i_ext->blk), le32_to_cpu(i_ext->len)); + get_extent_info(&ei, i_ext); write_lock(&et->lock); - if (et->count) + if (atomic_read(&et->node_cnt)) goto out; en = __init_extent_tree(sbi, et, &ei); @@ -202,6 +212,7 @@ void f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) } out: write_unlock(&et->lock); + return false; } static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, @@ -230,9 +241,10 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, if (en) { *ei = en->ei; spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) + if (!list_empty(&en->list)) { list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; + et->cached_en = en; + } spin_unlock(&sbi->extent_lock); ret = true; } @@ -325,12 +337,12 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, return en; } -static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, +static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, - struct extent_node **den, struct extent_node *prev_ex, struct extent_node *next_ex) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_node *en = NULL; if (prev_ex && __is_back_mergeable(ei, &prev_ex->ei)) { @@ -340,28 +352,34 @@ static struct extent_node *__try_merge_extent_node(struct f2fs_sb_info *sbi, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) { - __detach_extent_node(sbi, et, prev_ex); - *den = prev_ex; - } + if (en) + __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; en = next_ex; } - if (en) { - __try_update_largest_extent(et, en); + if (!en) + return NULL; + + __try_update_largest_extent(inode, et, en); + + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); et->cached_en = en; } + spin_unlock(&sbi->extent_lock); return en; } -static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, +static struct extent_node *__insert_extent_tree(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct rb_node **insert_p, struct rb_node *insert_parent) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct rb_node **p = &et->root.rb_node; struct rb_node *parent = NULL; struct extent_node *en = NULL; @@ -388,8 +406,13 @@ static struct extent_node *__insert_extent_tree(struct f2fs_sb_info *sbi, if (!en) return NULL; - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); + + /* update in global extent list */ + spin_lock(&sbi->extent_lock); + list_add_tail(&en->list, &sbi->extent_list); et->cached_en = en; + spin_unlock(&sbi->extent_lock); return en; } @@ -412,7 +435,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, write_lock(&et->lock); - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) { + if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); return false; } @@ -454,7 +477,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, set_extent_info(&ei, end, end - dei.fofs + dei.blk, org_end - end); - en1 = __insert_extent_tree(sbi, et, &ei, + en1 = __insert_extent_tree(inode, et, &ei, NULL, NULL); next_en = en1; } else { @@ -475,9 +498,9 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, } if (parts) - __try_update_largest_extent(et, en); + __try_update_largest_extent(inode, et, en); else - __detach_extent_node(sbi, et, en); + __release_extent_node(sbi, et, en); /* * if original extent is split into zero or two parts, extent @@ -488,58 +511,28 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, insert_p = NULL; insert_parent = NULL; } - - /* update in global extent list */ - spin_lock(&sbi->extent_lock); - if (!parts && !list_empty(&en->list)) - list_del(&en->list); - if (en1) - list_add_tail(&en1->list, &sbi->extent_list); - spin_unlock(&sbi->extent_lock); - - /* release extent node */ - if (!parts) - kmem_cache_free(extent_node_slab, en); - en = next_en; } /* 3. update extent in extent cache */ if (blkaddr) { - struct extent_node *den = NULL; set_extent_info(&ei, fofs, blkaddr, len); - en1 = __try_merge_extent_node(sbi, et, &ei, &den, - prev_en, next_en); - if (!en1) - en1 = __insert_extent_tree(sbi, et, &ei, + if (!__try_merge_extent_node(inode, et, &ei, prev_en, next_en)) + __insert_extent_tree(inode, et, &ei, insert_p, insert_parent); /* give up extent_cache, if split and small updates happen */ if (dei.len >= 1 && prev.len < F2FS_MIN_EXTENT_LEN && et->largest.len < F2FS_MIN_EXTENT_LEN) { - et->largest.len = 0; - set_inode_flag(F2FS_I(inode), FI_NO_EXTENT); + __drop_largest_extent(inode, 0, UINT_MAX); + set_inode_flag(inode, FI_NO_EXTENT); } - - spin_lock(&sbi->extent_lock); - if (en1) { - if (list_empty(&en1->list)) - list_add_tail(&en1->list, &sbi->extent_list); - else - list_move_tail(&en1->list, &sbi->extent_list); - } - if (den && !list_empty(&den->list)) - list_del(&den->list); - spin_unlock(&sbi->extent_lock); - - if (den) - kmem_cache_free(extent_node_slab, den); } - if (is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) - __free_extent_tree(sbi, et, true); + if (is_inode_flag_set(inode, FI_NO_EXTENT)) + __free_extent_tree(sbi, et); write_unlock(&et->lock); @@ -548,46 +541,42 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) { - struct extent_tree *treevec[EXT_TREE_VEC_SIZE]; - struct extent_node *en, *tmp; - unsigned long ino = F2FS_ROOT_INO(sbi); - struct radix_tree_root *root = &sbi->extent_tree_root; - unsigned int found; + struct extent_tree *et, *next; + struct extent_node *en; unsigned int node_cnt = 0, tree_cnt = 0; int remained; if (!test_opt(sbi, EXTENT_CACHE)) return 0; + if (!atomic_read(&sbi->total_zombie_tree)) + goto free_node; + if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - if (!atomic_read(&et->refcount)) { - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, true); - write_unlock(&et->lock); - - radix_tree_delete(root, et->ino); - kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; - tree_cnt++; - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } + list_for_each_entry_safe(et, next, &sbi->zombie_list, list) { + if (atomic_read(&et->node_cnt)) { + write_lock(&et->lock); + node_cnt += __free_extent_tree(sbi, et); + write_unlock(&et->lock); } + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); + list_del_init(&et->list); + radix_tree_delete(&sbi->extent_tree_root, et->ino); + kmem_cache_free(extent_tree_slab, et); + atomic_dec(&sbi->total_ext_tree); + atomic_dec(&sbi->total_zombie_tree); + tree_cnt++; + + if (node_cnt + tree_cnt >= nr_shrink) + goto unlock_out; + cond_resched(); } up_write(&sbi->extent_tree_lock); +free_node: /* 2. remove LRU extent entries */ if (!down_write_trylock(&sbi->extent_tree_lock)) goto out; @@ -595,34 +584,29 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) remained = nr_shrink - (node_cnt + tree_cnt); spin_lock(&sbi->extent_lock); - list_for_each_entry_safe(en, tmp, &sbi->extent_list, list) { - if (!remained--) + for (; remained > 0; remained--) { + if (list_empty(&sbi->extent_list)) break; + en = list_first_entry(&sbi->extent_list, + struct extent_node, list); + et = en->et; + if (!write_trylock(&et->lock)) { + /* refresh this extent node's position in extent list */ + list_move_tail(&en->list, &sbi->extent_list); + continue; + } + list_del_init(&en->list); + spin_unlock(&sbi->extent_lock); + + __detach_extent_node(sbi, et, en); + + write_unlock(&et->lock); + node_cnt++; + spin_lock(&sbi->extent_lock); } spin_unlock(&sbi->extent_lock); - /* - * reset ino for searching victims from beginning of global extent tree. - */ - ino = F2FS_ROOT_INO(sbi); - - while ((found = radix_tree_gang_lookup(root, - (void **)treevec, ino, EXT_TREE_VEC_SIZE))) { - unsigned i; - - ino = treevec[found - 1]->ino + 1; - for (i = 0; i < found; i++) { - struct extent_tree *et = treevec[i]; - - write_lock(&et->lock); - node_cnt += __free_extent_tree(sbi, et, false); - write_unlock(&et->lock); - - if (node_cnt + tree_cnt >= nr_shrink) - goto unlock_out; - } - } unlock_out: up_write(&sbi->extent_tree_lock); out: @@ -637,16 +621,29 @@ unsigned int f2fs_destroy_extent_node(struct inode *inode) struct extent_tree *et = F2FS_I(inode)->extent_tree; unsigned int node_cnt = 0; - if (!et) + if (!et || !atomic_read(&et->node_cnt)) return 0; write_lock(&et->lock); - node_cnt = __free_extent_tree(sbi, et, true); + node_cnt = __free_extent_tree(sbi, et); write_unlock(&et->lock); return node_cnt; } +void f2fs_drop_extent_tree(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct extent_tree *et = F2FS_I(inode)->extent_tree; + + set_inode_flag(inode, FI_NO_EXTENT); + + write_lock(&et->lock); + __free_extent_tree(sbi, et); + __drop_largest_extent(inode, 0, UINT_MAX); + write_unlock(&et->lock); +} + void f2fs_destroy_extent_tree(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -656,8 +653,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (!et) return; - if (inode->i_nlink && !is_bad_inode(inode) && et->count) { - atomic_dec(&et->refcount); + if (inode->i_nlink && !is_bad_inode(inode) && + atomic_read(&et->node_cnt)) { + down_write(&sbi->extent_tree_lock); + list_add_tail(&et->list, &sbi->zombie_list); + atomic_inc(&sbi->total_zombie_tree); + up_write(&sbi->extent_tree_lock); return; } @@ -666,11 +667,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) /* delete extent tree entry in radix tree */ down_write(&sbi->extent_tree_lock); - atomic_dec(&et->refcount); - f2fs_bug_on(sbi, atomic_read(&et->refcount) || et->count); + f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); - sbi->total_ext_tree--; + atomic_dec(&sbi->total_ext_tree); up_write(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -689,20 +689,20 @@ bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, void f2fs_update_extent_cache(struct dnode_of_data *dn) { - struct f2fs_inode_info *fi = F2FS_I(dn->inode); pgoff_t fofs; + block_t blkaddr; if (!f2fs_may_extent_tree(dn->inode)) return; - f2fs_bug_on(F2FS_I_SB(dn->inode), dn->data_blkaddr == NEW_ADDR); + if (dn->data_blkaddr == NEW_ADDR) + blkaddr = NULL_ADDR; + else + blkaddr = dn->data_blkaddr; - - fofs = start_bidx_of_node(ofs_of_node(dn->node_page), fi) + - dn->ofs_in_node; - - if (f2fs_update_extent_tree_range(dn->inode, fofs, dn->data_blkaddr, 1)) - sync_inode_page(dn); + fofs = start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + dn->ofs_in_node; + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, 1); } void f2fs_update_extent_cache_range(struct dnode_of_data *dn, @@ -712,8 +712,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, if (!f2fs_may_extent_tree(dn->inode)) return; - if (f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len)) - sync_inode_page(dn); + f2fs_update_extent_tree_range(dn->inode, fofs, blkaddr, len); } void init_extent_cache_info(struct f2fs_sb_info *sbi) @@ -722,7 +721,9 @@ void init_extent_cache_info(struct f2fs_sb_info *sbi) init_rwsem(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); - sbi->total_ext_tree = 0; + atomic_set(&sbi->total_ext_tree, 0); + INIT_LIST_HEAD(&sbi->zombie_list); + atomic_set(&sbi->total_zombie_tree, 0); atomic_set(&sbi->total_ext_node, 0); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 2871576fbca4..af293e84e5cd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -21,10 +21,12 @@ #include #include #include +#include +#include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) -#define f2fs_down_write(x, y) down_write_nest_lock(x, y) #else #define f2fs_bug_on(sbi, condition) \ do { \ @@ -33,7 +35,30 @@ set_sbi_flag(sbi, SBI_NEED_FSCK); \ } \ } while (0) -#define f2fs_down_write(x, y) down_write(x) +#endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION +enum { + FAULT_KMALLOC, + FAULT_PAGE_ALLOC, + FAULT_ALLOC_NID, + FAULT_ORPHAN, + FAULT_BLOCK, + FAULT_DIR_DEPTH, + FAULT_EVICT_INODE, + FAULT_IO, + FAULT_CHECKPOINT, + FAULT_MAX, +}; + +struct f2fs_fault_info { + atomic_t inject_ops; + unsigned int inject_rate; + unsigned int inject_type; +}; + +extern char *fault_name[FAULT_MAX]; +#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) #endif /* @@ -54,6 +79,10 @@ #define F2FS_MOUNT_FASTBOOT 0x00001000 #define F2FS_MOUNT_EXTENT_CACHE 0x00002000 #define F2FS_MOUNT_FORCE_FG_GC 0x00004000 +#define F2FS_MOUNT_DATA_FLUSH 0x00008000 +#define F2FS_MOUNT_FAULT_INJECTION 0x00010000 +#define F2FS_MOUNT_ADAPTIVE 0x00020000 +#define F2FS_MOUNT_LFS 0x00040000 #define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) #define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) @@ -74,6 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_HMSMR 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -82,25 +112,30 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) -#define CRCPOLY_LE 0xedb88320 - -static inline __u32 f2fs_crc32(void *buf, size_t len) +/** + * wq_has_sleeper - check if there are any waiting processes + * @wq: wait queue head + * + * Returns true if wq has waiting processes + * + * Please refer to the comment for waitqueue_active. + */ +static inline bool wq_has_sleeper(wait_queue_head_t *wq) { - unsigned char *p = (unsigned char *)buf; - __u32 crc = F2FS_SUPER_MAGIC; - int i; - - while (len--) { - crc ^= *p++; - for (i = 0; i < 8; i++) - crc = (crc >> 1) ^ ((crc & 1) ? CRCPOLY_LE : 0); - } - return crc; + /* + * We need to be sure we are in sync with the + * add_wait_queue modifications to the wait queue. + * + * This memory barrier should be paired with one on the + * waiting side. + */ + smp_mb(); + return waitqueue_active(wq); } -static inline bool f2fs_crc_valid(__u32 blk_crc, void *buf, size_t buf_size) +static inline void inode_nohighmem(struct inode *inode) { - return f2fs_crc32(buf, buf_size) == blk_crc; + mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } /* @@ -119,12 +154,13 @@ enum { CP_DISCARD, }; -#define DEF_BATCHED_TRIM_SECTIONS 32 +#define DEF_BATCHED_TRIM_SECTIONS 2 #define BATCHED_TRIM_SEGMENTS(sbi) \ (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) #define DEF_CP_INTERVAL 60 /* 60 secs */ +#define DEF_IDLE_INTERVAL 5 /* 5 secs */ struct cp_control { int reason; @@ -158,13 +194,7 @@ struct ino_entry { nid_t ino; /* inode number */ }; -/* - * for the list of directory inodes or gc inodes. - * NOTE: there are two slab users for this structure, if we add/modify/delete - * fields in structure for one of slab users, it may affect fields or size of - * other one, in this condition, it's better to split both of slab and related - * data structure. - */ +/* for the list of inodes to be GCed */ struct inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ @@ -177,46 +207,52 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; +struct bio_entry { + struct list_head list; + struct bio *bio; + struct completion event; + int error; +}; + /* for the list of fsync inodes, used only during recovery */ struct fsync_inode_entry { struct list_head list; /* list head */ struct inode *inode; /* vfs inode pointer */ block_t blkaddr; /* block address locating the last fsync */ block_t last_dentry; /* block address locating the last dentry */ - block_t last_inode; /* block address locating the last inode */ }; -#define nats_in_cursum(sum) (le16_to_cpu(sum->n_nats)) -#define sits_in_cursum(sum) (le16_to_cpu(sum->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) -#define nat_in_journal(sum, i) (sum->nat_j.entries[i].ne) -#define nid_in_journal(sum, i) (sum->nat_j.entries[i].nid) -#define sit_in_journal(sum, i) (sum->sit_j.entries[i].se) -#define segno_in_journal(sum, i) (sum->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) -#define MAX_NAT_JENTRIES(sum) (NAT_JOURNAL_ENTRIES - nats_in_cursum(sum)) -#define MAX_SIT_JENTRIES(sum) (SIT_JOURNAL_ENTRIES - sits_in_cursum(sum)) +#define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) +#define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) -static inline int update_nats_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { - int before = nats_in_cursum(rs); - rs->n_nats = cpu_to_le16(before + i); + int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } -static inline int update_sits_in_cursum(struct f2fs_summary_block *rs, int i) +static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { - int before = sits_in_cursum(rs); - rs->n_sits = cpu_to_le16(before + i); + int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } -static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, - int type) +static inline bool __has_cursum_space(struct f2fs_journal *journal, + int size, int type) { if (type == NAT_JOURNAL) - return size <= MAX_NAT_JENTRIES(sum); - return size <= MAX_SIT_JENTRIES(sum); + return size <= MAX_NAT_JENTRIES(journal); + return size <= MAX_SIT_JENTRIES(journal); } /* @@ -234,13 +270,13 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) #define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) +#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ + struct f2fs_move_range) -#define F2FS_IOC_SET_ENCRYPTION_POLICY \ - _IOR('f', 19, struct f2fs_encryption_policy) -#define F2FS_IOC_GET_ENCRYPTION_PWSALT \ - _IOW('f', 20, __u8[16]) -#define F2FS_IOC_GET_ENCRYPTION_POLICY \ - _IOW('f', 21, struct f2fs_encryption_policy) +#define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY +#define F2FS_IOC_GET_ENCRYPTION_PWSALT FS_IOC_GET_ENCRYPTION_PWSALT /* * should be same as XFS_IOC_GOINGDOWN. @@ -256,33 +292,27 @@ static inline bool __has_cursum_space(struct f2fs_summary_block *sum, int size, /* * ioctl commands in 32 bit emulation */ -#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS -#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETFLAGS FS_IOC32_GETFLAGS +#define F2FS_IOC32_SETFLAGS FS_IOC32_SETFLAGS +#define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_defragment { + u64 start; + u64 len; +}; + +struct f2fs_move_range { + u32 dst_fd; /* destination fd */ + u64 pos_in; /* start position in src_fd */ + u64 pos_out; /* start position in dst_fd */ + u64 len; /* size to move */ +}; + /* * For INODE and NODE manager */ /* for directory operations */ -struct f2fs_str { - unsigned char *name; - u32 len; -}; - -struct f2fs_filename { - const struct qstr *usr_fname; - struct f2fs_str disk_name; - f2fs_hash_t hash; -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_str crypto_buf; -#endif -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - struct f2fs_dentry_ptr { struct inode *inode; const void *bitmap; @@ -350,6 +380,7 @@ struct extent_node { struct rb_node rb_node; /* rb node located in rb-tree */ struct list_head list; /* node in global extent list of sbi */ struct extent_info ei; /* extent info */ + struct extent_tree *et; /* extent tree pointer */ }; struct extent_tree { @@ -357,9 +388,9 @@ struct extent_tree { struct rb_root root; /* root of extent info rb-tree */ struct extent_node *cached_en; /* recently accessed extent node */ struct extent_info largest; /* largested extent info */ + struct list_head list; /* to be used by sbi->zombie_list */ rwlock_t lock; /* protect extent info rb-tree */ - atomic_t refcount; /* reference count of rb-tree */ - unsigned int count; /* # of extent node in rb-tree*/ + atomic_t node_cnt; /* # of extent node in rb-tree*/ }; /* @@ -378,6 +409,7 @@ struct f2fs_map_blocks { block_t m_lblk; unsigned int m_len; unsigned int m_flags; + pgoff_t *m_next_pgofs; /* point next possible non-hole pgofs */ }; /* for flag in get_data_block */ @@ -385,6 +417,8 @@ struct f2fs_map_blocks { #define F2FS_GET_BLOCK_DIO 1 #define F2FS_GET_BLOCK_FIEMAP 2 #define F2FS_GET_BLOCK_BMAP 3 +#define F2FS_GET_BLOCK_PRE_DIO 4 +#define F2FS_GET_BLOCK_PRE_AIO 5 /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -406,15 +440,6 @@ struct f2fs_map_blocks { #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) -/* Encryption algorithms */ -#define F2FS_ENCRYPTION_MODE_INVALID 0 -#define F2FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define F2FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define F2FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define F2FS_ENCRYPTION_MODE_AES_256_CTS 4 - -#include "f2fs_crypto.h" - #define DEF_DIR_LEVEL 0 struct f2fs_inode_info { @@ -429,30 +454,27 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - atomic_t dirty_pages; /* # of dirty pages */ + struct percpu_counter dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ unsigned long long xattr_ver; /* cp version of xattr modification */ - struct inode_entry *dirty_dir; /* the pointer of dirty dir */ + loff_t last_disk_size; /* lastly written file size */ + struct list_head dirty_list; /* dirty list for dirs and files */ + struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ struct mutex inmem_lock; /* lock for inmemory pages */ - struct extent_tree *extent_tree; /* cached extent_tree entry */ - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - /* Encryption params */ - struct f2fs_crypt_info *i_crypt_info; -#endif + struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ }; static inline void get_extent_info(struct extent_info *ext, - struct f2fs_extent i_ext) + struct f2fs_extent *i_ext) { - ext->fofs = le32_to_cpu(i_ext.fofs); - ext->blk = le32_to_cpu(i_ext.blk); - ext->len = le32_to_cpu(i_ext.len); + ext->fofs = le32_to_cpu(i_ext->fofs); + ext->blk = le32_to_cpu(i_ext->blk); + ext->len = le32_to_cpu(i_ext->len); } static inline void set_raw_extent(struct extent_info *ext, @@ -497,11 +519,14 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -static inline void __try_update_largest_extent(struct extent_tree *et, - struct extent_node *en) +extern void f2fs_mark_inode_dirty_sync(struct inode *); +static inline void __try_update_largest_extent(struct inode *inode, + struct extent_tree *et, struct extent_node *en) { - if (en->ei.len > et->largest.len) + if (en->ei.len > et->largest.len) { et->largest = en->ei; + f2fs_mark_inode_dirty_sync(inode); + } } struct f2fs_nm_info { @@ -511,6 +536,7 @@ struct f2fs_nm_info { nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ + unsigned int dirty_nats_ratio; /* control dirty nats ratio threshold */ /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ @@ -544,6 +570,9 @@ struct dnode_of_data { nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ bool inode_page_locked; /* inode page is locked or not */ + bool node_changed; /* is node block changed */ + char cur_level; /* level of hole node page */ + char max_level; /* level of current page located */ block_t data_blkaddr; /* block address of the node block */ }; @@ -594,6 +623,7 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ + atomic_t submit_flush; /* # of issued flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -618,6 +648,7 @@ struct f2fs_sm_info { /* for small discard management */ struct list_head discard_list; /* 4KB discard list */ + struct list_head wait_list; /* linked with issued discard bio */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -645,11 +676,12 @@ struct f2fs_sm_info { * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ enum count_type { - F2FS_WRITEBACK, F2FS_DIRTY_DENTS, + F2FS_DIRTY_DATA, F2FS_DIRTY_NODES, F2FS_DIRTY_META, F2FS_INMEM_PAGES, + F2FS_DIRTY_IMETA, NR_COUNT_TYPE, }; @@ -673,6 +705,7 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_REVOKE, IPU, OPU, }; @@ -681,7 +714,8 @@ struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ - block_t blk_addr; /* block address to be written */ + block_t new_blkaddr; /* new block address to be written */ + block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; @@ -695,6 +729,13 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +enum inode_type { + DIR_INODE, /* for dirty dir inode */ + FILE_INODE, /* for dirty regular/symlink inode */ + DIRTY_META, /* for all dirtied inode metadata */ + NR_INODE_TYPE, +}; + /* for inner inode cache management */ struct inode_management { struct radix_tree_root ino_root; /* ino entry array */ @@ -709,15 +750,31 @@ enum { SBI_IS_CLOSE, /* specify unmounting */ SBI_NEED_FSCK, /* need fsck.f2fs to fix */ SBI_POR_DOING, /* recovery is doing or not */ + SBI_NEED_SB_WRITE, /* need to recover superblock */ + SBI_NEED_CP, /* need to checkpoint */ }; +enum { + CP_TIME, + REQ_TIME, + MAX_TIME, +}; + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#define F2FS_KEY_DESC_PREFIX "f2fs:" +#define F2FS_KEY_DESC_PREFIX_SIZE 5 +#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ - struct buffer_head *raw_super_buf; /* buffer head of raw sb */ struct f2fs_super_block *raw_super; /* raw super block pointer */ - int s_flag; /* flags for sbi */ + int valid_super_block; /* valid super block no */ + unsigned long s_flag; /* flags for sbi */ +#ifdef CONFIG_F2FS_FS_ENCRYPTION + u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; + u8 key_prefix_size; +#endif /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -728,32 +785,36 @@ struct f2fs_sb_info { /* for bio operations */ struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ - struct mutex writepages; /* mutex for writepages() */ wait_queue_head_t cp_wait; - long cp_expires, cp_interval; /* next expected periodic cp */ + unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ + long interval_time[MAX_TIME]; /* to store thresholds */ struct inode_management im[MAX_INO_ENTRY]; /* manage inode cache */ /* for orphan inode, use 0'th array */ unsigned int max_orphans; /* max orphan inodes */ - /* for directory inode management */ - struct list_head dir_inode_list; /* dir inode list */ - spinlock_t dir_inode_lock; /* for dir inode list lock */ + /* for inode management */ + struct list_head inode_list[NR_INODE_TYPE]; /* dirty inode list */ + spinlock_t inode_lock[NR_INODE_TYPE]; /* for dirty inode list lock */ /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ - int total_ext_tree; /* extent tree count */ + atomic_t total_ext_tree; /* extent tree count */ + struct list_head zombie_list; /* extent zombie tree list */ + atomic_t total_zombie_tree; /* extent zombie tree count */ atomic_t total_ext_node; /* extent info count */ /* basic filesystem units */ @@ -770,17 +831,24 @@ struct f2fs_sb_info { unsigned int total_sections; /* total section count */ unsigned int total_node_count; /* total node block count */ unsigned int total_valid_node_count; /* valid node block count */ - unsigned int total_valid_inode_count; /* valid inode count */ + loff_t max_file_blocks; /* max block index of file */ int active_logs; /* # of active logs */ int dir_level; /* directory level */ block_t user_block_count; /* # of user blocks */ block_t total_valid_block_count; /* # of valid blocks */ - block_t alloc_valid_block_count; /* # of allocated blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_pages[NR_COUNT_TYPE]; /* # of pages, see count_type */ + atomic_t nr_wb_bios; /* # of writeback bios */ + + /* # of pages, see count_type */ + struct percpu_counter nr_pages[NR_COUNT_TYPE]; + /* # of allocated blocks */ + struct percpu_counter alloc_valid_block_count; + + /* valid inode count */ + struct percpu_counter total_valid_inode_count; struct f2fs_mount_info mount_opt; /* mount options */ @@ -809,7 +877,7 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ int bg_gc; /* background gc calls */ - unsigned int n_dirty_dirs; /* # of dir inodes */ + unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ @@ -822,11 +890,102 @@ struct f2fs_sb_info { struct list_head s_list; struct mutex umount_mutex; unsigned int shrinker_run_no; + + /* For write statistics */ + u64 sectors_written_start; + u64 kbytes_written; + + /* Reference to checksum algorithm driver via cryptoapi */ + struct crypto_shash *s_chksum_driver; + + /* For fault injection */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info fault_info; +#endif }; +#ifdef CONFIG_F2FS_FAULT_INJECTION +static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (!ffi->inject_rate) + return false; + + if (!IS_FAULT_SET(ffi, type)) + return false; + + atomic_inc(&ffi->inject_ops); + if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { + atomic_set(&ffi->inject_ops, 0); + printk("%sF2FS-fs : inject %s in %pF\n", + KERN_INFO, + fault_name[type], + __builtin_return_address(0)); + return true; + } + return false; +} +#endif + +/* For write statistics. Suppose sector size is 512 bytes, + * and the return value is in kbytes. s is of struct f2fs_sb_info. + */ +#define BD_PART_WRITTEN(s) \ +(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ + s->sectors_written_start) >> 1) + +static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) +{ + sbi->last_time[type] = jiffies; +} + +static inline bool f2fs_time_over(struct f2fs_sb_info *sbi, int type) +{ + struct timespec ts = {sbi->interval_time[type], 0}; + unsigned long interval = timespec_to_jiffies(&ts); + + return time_after(jiffies, sbi->last_time[type] + interval); +} + +static inline bool is_idle(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + struct request_queue *q = bdev_get_queue(bdev); + struct request_list *rl = &q->root_rl; + + if (rl->count[BLK_RW_SYNC] || rl->count[BLK_RW_ASYNC]) + return 0; + + return f2fs_time_over(sbi, REQ_TIME); +} + /* * Inline functions */ +static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, + unsigned int length) +{ + SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); + u32 *ctx = (u32 *)shash_desc_ctx(shash); + int err; + + shash->tfm = sbi->s_chksum_driver; + shash->flags = 0; + *ctx = F2FS_SUPER_MAGIC; + + err = crypto_shash_update(shash, address, length); + BUG_ON(err); + + return *ctx; +} + +static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, + void *buf, size_t buf_size) +{ + return f2fs_crc32(sbi, buf, buf_size) == blk_crc; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -909,17 +1068,17 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { - return sbi->s_flag & (0x01 << type); + return test_bit(type, &sbi->s_flag); } static inline void set_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag |= (0x01 << type); + set_bit(type, &sbi->s_flag); } static inline void clear_sbi_flag(struct f2fs_sb_info *sbi, unsigned int type) { - sbi->s_flag &= ~(0x01 << type); + clear_bit(type, &sbi->s_flag); } static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) @@ -927,26 +1086,57 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } -static inline bool is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return ckpt_flags & f; } -static inline void set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline bool is_set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + return __is_set_ckpt_flags(F2FS_CKPT(sbi), f); +} + +static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags |= f; cp->ckpt_flags = cpu_to_le32(ckpt_flags); } -static inline void clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); + spin_lock(&sbi->cp_lock); + __set_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) +{ + unsigned int ckpt_flags; + + ckpt_flags = le32_to_cpu(cp->ckpt_flags); ckpt_flags &= (~f); cp->ckpt_flags = cpu_to_le32(ckpt_flags); } +static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) +{ + spin_lock(&sbi->cp_lock); + __clear_ckpt_flags(F2FS_CKPT(sbi), f); + spin_unlock(&sbi->cp_lock); +} + +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q); +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -959,7 +1149,7 @@ static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) static inline void f2fs_lock_all(struct f2fs_sb_info *sbi) { - f2fs_down_write(&sbi->cp_rwsem, &sbi->cp_mutex); + down_write(&sbi->cp_rwsem); } static inline void f2fs_unlock_all(struct f2fs_sb_info *sbi) @@ -985,8 +1175,8 @@ static inline bool __remain_node_summaries(int reason) static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) { - return (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG) || - is_set_ckpt_flags(F2FS_CKPT(sbi), CP_FASTBOOT_FLAG)); + return (is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG) || + is_set_ckpt_flags(sbi, CP_FASTBOOT_FLAG)); } /* @@ -1019,22 +1209,37 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } +static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, - struct inode *inode, blkcnt_t count) + struct inode *inode, blkcnt_t *count) { - block_t valid_block_count; + blkcnt_t diff; + +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_BLOCK)) + return false; +#endif + /* + * let's increase this in prior to actual block count change in order + * for f2fs_sync_file to avoid data races when deciding checkpoint. + */ + percpu_counter_add(&sbi->alloc_valid_block_count, (*count)); spin_lock(&sbi->stat_lock); - valid_block_count = - sbi->total_valid_block_count + (block_t)count; - if (unlikely(valid_block_count > sbi->user_block_count)) { - spin_unlock(&sbi->stat_lock); - return false; + sbi->total_valid_block_count += (block_t)(*count); + if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { + diff = sbi->total_valid_block_count - sbi->user_block_count; + *count -= diff; + sbi->total_valid_block_count = sbi->user_block_count; + if (!*count) { + spin_unlock(&sbi->stat_lock); + percpu_counter_sub(&sbi->alloc_valid_block_count, diff); + return false; + } } - inode->i_blocks += count; - sbi->total_valid_block_count = valid_block_count; - sbi->alloc_valid_block_count += (block_t)count; spin_unlock(&sbi->stat_lock); + + f2fs_i_blocks_write(inode, *count, true); return true; } @@ -1045,27 +1250,31 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); f2fs_bug_on(sbi, inode->i_blocks < count); - inode->i_blocks -= count; sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); + f2fs_i_blocks_write(inode, count, false); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_inc(&sbi->nr_pages[count_type]); + percpu_counter_inc(&sbi->nr_pages[count_type]); + + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + return; + set_sbi_flag(sbi, SBI_IS_DIRTY); } static inline void inode_inc_dirty_pages(struct inode *inode) { - atomic_inc(&F2FS_I(inode)->dirty_pages); - if (S_ISDIR(inode->i_mode)) - inc_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - atomic_dec(&sbi->nr_pages[count_type]); + percpu_counter_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1074,28 +1283,28 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - atomic_dec(&F2FS_I(inode)->dirty_pages); - - if (S_ISDIR(inode->i_mode)) - dec_page_count(F2FS_I_SB(inode), F2FS_DIRTY_DENTS); + percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? + F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } -static inline int get_pages(struct f2fs_sb_info *sbi, int count_type) +static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return atomic_read(&sbi->nr_pages[count_type]); + return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); } -static inline int get_dirty_pages(struct inode *inode) +static inline s64 get_dirty_pages(struct inode *inode) { - return atomic_read(&F2FS_I(inode)->dirty_pages); + return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) { - unsigned int pages_per_sec = sbi->segs_per_sec * - (1 << sbi->log_blocks_per_seg); - return ((get_pages(sbi, block_type) + pages_per_sec - 1) - >> sbi->log_blocks_per_seg) / sbi->segs_per_sec; + unsigned int pages_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + unsigned int segs = (get_pages(sbi, block_type) + pages_per_sec - 1) >> + sbi->log_blocks_per_seg; + + return segs / sbi->segs_per_sec; } static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) @@ -1103,6 +1312,11 @@ static inline block_t valid_user_blocks(struct f2fs_sb_info *sbi) return sbi->total_valid_block_count; } +static inline block_t discard_blocks(struct f2fs_sb_info *sbi) +{ + return sbi->discard_blks; +} + static inline unsigned long __bitmap_size(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); @@ -1182,13 +1396,13 @@ static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, } if (inode) - inode->i_blocks++; + f2fs_i_blocks_write(inode, 1, true); - sbi->alloc_valid_block_count++; sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->alloc_valid_block_count); return true; } @@ -1201,7 +1415,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, f2fs_bug_on(sbi, !sbi->total_valid_node_count); f2fs_bug_on(sbi, !inode->i_blocks); - inode->i_blocks--; + f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; @@ -1215,28 +1429,30 @@ static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) static inline void inc_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_inode_count == sbi->total_node_count); - sbi->total_valid_inode_count++; - spin_unlock(&sbi->stat_lock); + percpu_counter_inc(&sbi->total_valid_inode_count); } static inline void dec_valid_inode_count(struct f2fs_sb_info *sbi) { - spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, !sbi->total_valid_inode_count); - sbi->total_valid_inode_count--; - spin_unlock(&sbi->stat_lock); + percpu_counter_dec(&sbi->total_valid_inode_count); } -static inline unsigned int valid_inode_count(struct f2fs_sb_info *sbi) +static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) { - return sbi->total_valid_inode_count; + return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, pgoff_t index, bool for_write) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct page *page = find_lock_page(mapping, index); + if (page) + return page; + + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + return NULL; +#endif if (!for_write) return grab_cache_page(mapping, index); return grab_cache_page_write_begin(mapping, index, AOP_FLAG_NOFS); @@ -1261,7 +1477,7 @@ static inline void f2fs_put_page(struct page *page, int unlock) f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); unlock_page(page); } - page_cache_release(page); + put_page(page); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -1396,13 +1612,12 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) enum { FI_NEW_INODE, /* indicate newly allocated inode */ FI_DIRTY_INODE, /* indicate inode is dirty or not */ + FI_AUTO_RECOVER, /* indicate inode is recoverable */ FI_DIRTY_DIR, /* indicate directory has dirty pages */ FI_INC_LINK, /* need to increment i_nlink */ FI_ACL_MODE, /* indicate acl mode */ FI_NO_ALLOC, /* should not allocate any blocks */ FI_FREE_NID, /* free allocated nide */ - FI_UPDATE_DIR, /* should update inode block for consistency */ - FI_DELAY_IPUT, /* used for the recovery */ FI_NO_EXTENT, /* not to use the extent cache */ FI_INLINE_XATTR, /* used for inline xattr */ FI_INLINE_DATA, /* used for inline data*/ @@ -1416,71 +1631,152 @@ enum { FI_DROP_CACHE, /* drop dirty page cache */ FI_DATA_EXIST, /* indicate data exists */ FI_INLINE_DOTS, /* indicate inline dot dentries */ + FI_DO_DEFRAG, /* indicate defragment is running */ + FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ }; -static inline void set_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline void __mark_inode_dirty_flag(struct inode *inode, + int flag, bool set) { - if (!test_bit(flag, &fi->flags)) - set_bit(flag, &fi->flags); + switch (flag) { + case FI_INLINE_XATTR: + case FI_INLINE_DATA: + case FI_INLINE_DENTRY: + if (set) + return; + case FI_DATA_EXIST: + case FI_INLINE_DOTS: + f2fs_mark_inode_dirty_sync(inode); + } } -static inline int is_inode_flag_set(struct f2fs_inode_info *fi, int flag) +static inline void set_inode_flag(struct inode *inode, int flag) { - return test_bit(flag, &fi->flags); + if (!test_bit(flag, &F2FS_I(inode)->flags)) + set_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, true); } -static inline void clear_inode_flag(struct f2fs_inode_info *fi, int flag) +static inline int is_inode_flag_set(struct inode *inode, int flag) { - if (test_bit(flag, &fi->flags)) - clear_bit(flag, &fi->flags); + return test_bit(flag, &F2FS_I(inode)->flags); } -static inline void set_acl_inode(struct f2fs_inode_info *fi, umode_t mode) +static inline void clear_inode_flag(struct inode *inode, int flag) { - fi->i_acl_mode = mode; - set_inode_flag(fi, FI_ACL_MODE); + if (test_bit(flag, &F2FS_I(inode)->flags)) + clear_bit(flag, &F2FS_I(inode)->flags); + __mark_inode_dirty_flag(inode, flag, false); } -static inline void get_inline_info(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_acl_inode(struct inode *inode, umode_t mode) { + F2FS_I(inode)->i_acl_mode = mode; + set_inode_flag(inode, FI_ACL_MODE); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_links_write(struct inode *inode, bool inc) +{ + if (inc) + inc_nlink(inode); + else + drop_nlink(inode); + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_blocks_write(struct inode *inode, + blkcnt_t diff, bool add) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + inode->i_blocks = add ? inode->i_blocks + diff : + inode->i_blocks - diff; + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) +{ + bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); + bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); + + if (i_size_read(inode) == i_size) + return; + + i_size_write(inode, i_size); + f2fs_mark_inode_dirty_sync(inode); + if (clean || recover) + set_inode_flag(inode, FI_AUTO_RECOVER); +} + +static inline bool f2fs_skip_inode_update(struct inode *inode) +{ + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + +static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) +{ + F2FS_I(inode)->i_current_depth = depth; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) +{ + F2FS_I(inode)->i_xattr_nid = xnid; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) +{ + F2FS_I(inode)->i_pino = pino; + f2fs_mark_inode_dirty_sync(inode); +} + +static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + if (ri->i_inline & F2FS_INLINE_XATTR) - set_inode_flag(fi, FI_INLINE_XATTR); + set_bit(FI_INLINE_XATTR, &fi->flags); if (ri->i_inline & F2FS_INLINE_DATA) - set_inode_flag(fi, FI_INLINE_DATA); + set_bit(FI_INLINE_DATA, &fi->flags); if (ri->i_inline & F2FS_INLINE_DENTRY) - set_inode_flag(fi, FI_INLINE_DENTRY); + set_bit(FI_INLINE_DENTRY, &fi->flags); if (ri->i_inline & F2FS_DATA_EXIST) - set_inode_flag(fi, FI_DATA_EXIST); + set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) - set_inode_flag(fi, FI_INLINE_DOTS); + set_bit(FI_INLINE_DOTS, &fi->flags); } -static inline void set_raw_inline(struct f2fs_inode_info *fi, - struct f2fs_inode *ri) +static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) { ri->i_inline = 0; - if (is_inode_flag_set(fi, FI_INLINE_XATTR)) + if (is_inode_flag_set(inode, FI_INLINE_XATTR)) ri->i_inline |= F2FS_INLINE_XATTR; - if (is_inode_flag_set(fi, FI_INLINE_DATA)) + if (is_inode_flag_set(inode, FI_INLINE_DATA)) ri->i_inline |= F2FS_INLINE_DATA; - if (is_inode_flag_set(fi, FI_INLINE_DENTRY)) + if (is_inode_flag_set(inode, FI_INLINE_DENTRY)) ri->i_inline |= F2FS_INLINE_DENTRY; - if (is_inode_flag_set(fi, FI_DATA_EXIST)) + if (is_inode_flag_set(inode, FI_DATA_EXIST)) ri->i_inline |= F2FS_DATA_EXIST; - if (is_inode_flag_set(fi, FI_INLINE_DOTS)) + if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; } static inline int f2fs_has_inline_xattr(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_XATTR); + return is_inode_flag_set(inode, FI_INLINE_XATTR); } -static inline unsigned int addrs_per_inode(struct f2fs_inode_info *fi) +static inline unsigned int addrs_per_inode(struct inode *inode) { - if (f2fs_has_inline_xattr(&fi->vfs_inode)) + if (f2fs_has_inline_xattr(inode)) return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; return DEF_ADDRS_PER_INODE; } @@ -1502,43 +1798,43 @@ static inline int inline_xattr_size(struct inode *inode) static inline int f2fs_has_inline_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DATA); + return is_inode_flag_set(inode, FI_INLINE_DATA); } static inline void f2fs_clear_inline_inode(struct inode *inode) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - clear_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + clear_inode_flag(inode, FI_INLINE_DATA); + clear_inode_flag(inode, FI_DATA_EXIST); } static inline int f2fs_exist_data(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DATA_EXIST); + return is_inode_flag_set(inode, FI_DATA_EXIST); } static inline int f2fs_has_inline_dots(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DOTS); + return is_inode_flag_set(inode, FI_INLINE_DOTS); } static inline bool f2fs_is_atomic_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_ATOMIC_FILE); + return is_inode_flag_set(inode, FI_ATOMIC_FILE); } static inline bool f2fs_is_volatile_file(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_VOLATILE_FILE); + return is_inode_flag_set(inode, FI_VOLATILE_FILE); } static inline bool f2fs_is_first_block_written(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + return is_inode_flag_set(inode, FI_FIRST_BLOCK_WRITTEN); } static inline bool f2fs_is_drop_cache(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_DROP_CACHE); + return is_inode_flag_set(inode, FI_DROP_CACHE); } static inline void *inline_data_addr(struct page *page) @@ -1549,7 +1845,7 @@ static inline void *inline_data_addr(struct page *page) static inline int f2fs_has_inline_dentry(struct inode *inode) { - return is_inode_flag_set(F2FS_I(inode), FI_INLINE_DENTRY); + return is_inode_flag_set(inode, FI_INLINE_DENTRY); } static inline void f2fs_dentry_kunmap(struct inode *dir, struct page *page) @@ -1566,11 +1862,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; + f2fs_mark_inode_dirty_sync(inode); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; + f2fs_mark_inode_dirty_sync(inode); } static inline int f2fs_readonly(struct super_block *sb) @@ -1580,13 +1878,7 @@ static inline int f2fs_readonly(struct super_block *sb) static inline bool f2fs_cp_error(struct f2fs_sb_info *sbi) { - return is_set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); -} - -static inline void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi) -{ - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - sbi->sb->s_flags |= MS_RDONLY; + return is_set_ckpt_flags(sbi, CP_ERROR_FLAG); } static inline bool is_dot_dotdot(const struct qstr *str) @@ -1602,13 +1894,21 @@ static inline bool is_dot_dotdot(const struct qstr *str) static inline bool f2fs_may_extent_tree(struct inode *inode) { - mode_t mode = inode->i_mode; - if (!test_opt(F2FS_I_SB(inode), EXTENT_CACHE) || - is_inode_flag_set(F2FS_I(inode), FI_NO_EXTENT)) + is_inode_flag_set(inode, FI_NO_EXTENT)) return false; - return S_ISREG(mode); + return S_ISREG(inode->i_mode); +} + +static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, + size_t size, gfp_t flags) +{ +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_KMALLOC)) + return NULL; +#endif + return kmalloc(size, flags); } static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) @@ -1632,14 +1932,14 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) } #define get_inode_mode(i) \ - ((is_inode_flag_set(F2FS_I(i), FI_ACL_MODE)) ? \ + ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) /* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, fi) \ - ((pgofs < ADDRS_PER_INODE(fi)) ? ADDRS_PER_INODE(fi) : \ - (pgofs - ADDRS_PER_INODE(fi) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi)) +#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ + ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ + (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ + ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) /* * file.c @@ -1647,7 +1947,7 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) int f2fs_sync_file(struct file *, loff_t, loff_t, int); void truncate_data_blocks(struct dnode_of_data *); int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *, bool); +int f2fs_truncate(struct inode *); int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); int f2fs_setattr(struct dentry *, struct iattr *); int truncate_hole(struct inode *, pgoff_t, pgoff_t); @@ -1660,9 +1960,10 @@ long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); */ void f2fs_set_inode_flags(struct inode *); struct inode *f2fs_iget(struct super_block *, unsigned long); +struct inode *f2fs_iget_retry(struct super_block *, unsigned long); int try_to_free_nats(struct f2fs_sb_info *, int); -void update_inode(struct inode *, struct page *); -void update_inode_page(struct inode *); +int update_inode(struct inode *, struct page *); +int update_inode_page(struct inode *); int f2fs_write_inode(struct inode *, struct writeback_control *); void f2fs_evict_inode(struct inode *); void handle_failed_inode(struct inode *); @@ -1675,29 +1976,34 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -extern unsigned char f2fs_filetype_table[F2FS_FT_MAX]; void set_de_type(struct f2fs_dir_entry *, umode_t); - -struct f2fs_dir_entry *find_target_dentry(struct f2fs_filename *, +unsigned char get_de_type(struct f2fs_dir_entry *); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct f2fs_str *); + unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, struct page *); + const struct qstr *, const struct qstr *, struct page *); void update_parent_metadata(struct inode *, struct inode *, unsigned int); int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *, struct page *); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, struct qstr *, +void f2fs_drop_nlink(struct inode *, struct inode *); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, + struct page **); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, struct page **); struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, struct qstr *); +ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, struct page *, struct inode *); int update_dent_inode(struct inode *, struct inode *, const struct qstr *); void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, const struct qstr *, f2fs_hash_t , unsigned int); +int f2fs_add_regular_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); +int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, + nid_t, umode_t); int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, @@ -1714,16 +2020,18 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ +int f2fs_inode_dirtied(struct inode *); +void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); extern __printf(3, 4) void f2fs_msg(struct super_block *, const char *, const char *, ...); +int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, - struct f2fs_filename *fname); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *); /* * node.c @@ -1736,6 +2044,7 @@ int need_dentry_mark(struct f2fs_sb_info *, nid_t); bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); bool need_inode_block_update(struct f2fs_sb_info *, nid_t); void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); +pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); int truncate_inode_blocks(struct inode *, pgoff_t); int truncate_xattr_node(struct inode *, struct page *); @@ -1746,8 +2055,11 @@ struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); void ra_node_page(struct f2fs_sb_info *, nid_t); struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); struct page *get_node_page_ra(struct page *, int); -void sync_inode_page(struct dnode_of_data *); -int sync_node_pages(struct f2fs_sb_info *, nid_t, struct writeback_control *); +void move_node_page(struct page *, int); +int fsync_node_pages(struct f2fs_sb_info *, struct inode *, + struct writeback_control *, bool); +int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); +void build_free_nids(struct f2fs_sb_info *); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); @@ -1767,8 +2079,9 @@ void destroy_node_manager_caches(void); * segment.c */ void register_inmem_page(struct inode *, struct page *); -int commit_inmem_pages(struct inode *, bool); -void f2fs_balance_fs(struct f2fs_sb_info *); +void drop_inmem_pages(struct inode *); +int commit_inmem_pages(struct inode *); +void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); @@ -1778,7 +2091,6 @@ bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); -bool discard_next_dnode(struct f2fs_sb_info *, block_t); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); @@ -1788,16 +2100,17 @@ void write_meta_page(struct f2fs_sb_info *, struct page *); void write_node_page(unsigned int, struct f2fs_io_info *); void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); void rewrite_data_page(struct f2fs_io_info *); +void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, + block_t, block_t, bool, bool); void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool); + block_t, block_t, unsigned char, bool, bool); void allocate_data_block(struct f2fs_sb_info *, struct page *, block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type); +void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); void write_data_summaries(struct f2fs_sb_info *, block_t); void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_summary_block *, - int, unsigned int, int); +int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); int build_segment_manager(struct f2fs_sb_info *); void destroy_segment_manager(struct f2fs_sb_info *); @@ -1807,6 +2120,7 @@ void destroy_segment_manager_caches(void); /* * checkpoint.c */ +void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); @@ -1814,21 +2128,21 @@ bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void remove_dirty_inode(struct f2fs_sb_info *, nid_t, int type); -void release_dirty_inode(struct f2fs_sb_info *); +void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); +void release_ino_entry(struct f2fs_sb_info *, bool); bool exist_written_data(struct f2fs_sb_info *, nid_t, int); +int f2fs_sync_inode_meta(struct f2fs_sb_info *); int acquire_orphan_inode(struct f2fs_sb_info *); void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct f2fs_sb_info *, nid_t); +void add_orphan_inode(struct inode *); void remove_orphan_inode(struct f2fs_sb_info *, nid_t); int recover_orphan_inodes(struct f2fs_sb_info *); int get_valid_checkpoint(struct f2fs_sb_info *); void update_dirty_page(struct inode *, struct page *); -void add_dirty_dir_inode(struct inode *); -void remove_dirty_dir_inode(struct inode *); -void sync_dirty_dir_inodes(struct f2fs_sb_info *); -void write_checkpoint(struct f2fs_sb_info *, struct cp_control *); +void remove_dirty_inode(struct inode *); +int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); +int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); void init_ino_entry_info(struct f2fs_sb_info *); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); @@ -1837,34 +2151,46 @@ void destroy_checkpoint_caches(void); * data.c */ void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); +void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, + struct page *, nid_t, enum page_type, int); +void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); void set_data_blkaddr(struct dnode_of_data *); +void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); +int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); +ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); struct page *get_lock_data_page(struct inode *, pgoff_t, bool); struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); int do_write_data_page(struct f2fs_io_info *); +int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); +void f2fs_set_page_dirty_nobuffers(struct page *); void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); int f2fs_release_page(struct page *, gfp_t); +#ifdef CONFIG_MIGRATION +int f2fs_migrate_page(struct address_space *, struct page *, struct page *, + enum migrate_mode); +#endif /* * gc.c */ int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct f2fs_inode_info *); +block_t start_bidx_of_node(unsigned int, struct inode *); int f2fs_gc(struct f2fs_sb_info *, bool); void build_gc_manager(struct f2fs_sb_info *); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *, bool); bool space_for_roll_forward(struct f2fs_sb_info *); /* @@ -1878,18 +2204,20 @@ struct f2fs_stat_info { int main_area_segs, main_area_sections, main_area_zones; unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; - int ext_tree, ext_node; - int ndirty_node, ndirty_dent, ndirty_dirs, ndirty_meta; + int ext_tree, zombie_tree, ext_node; + s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + s64 inmem_pages; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, fnids; int total_count, utilization; - int bg_gc, inmem_pages, wb_pages; - int inline_xattr, inline_inode, inline_dir; - unsigned int valid_count, valid_node_count, valid_inode_count; + int bg_gc, wb_bios; + int inline_xattr, inline_inode, inline_dir, orphans; + unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; int rsvd_segs, overp_segs; int dirty_count, node_pages, meta_pages; - int prefree_count, call_count, cp_count; + int prefree_count, call_count, cp_count, bg_cp_count; int tot_segs, node_segs, data_segs, free_segs, free_secs; int bg_node_segs, bg_data_segs; int tot_blks, data_blks, node_blks; @@ -1910,10 +2238,11 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } #define stat_inc_cp_count(si) ((si)->cp_count++) +#define stat_inc_bg_cp_count(si) ((si)->bg_cp_count++) #define stat_inc_call_count(si) ((si)->call_count++) #define stat_inc_bggc_count(sbi) ((sbi)->bg_gc++) -#define stat_inc_dirty_dir(sbi) ((sbi)->n_dirty_dirs++) -#define stat_dec_dirty_dir(sbi) ((sbi)->n_dirty_dirs--) +#define stat_inc_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]++) +#define stat_dec_dirty_inode(sbi, type) ((sbi)->ndirty_inode[type]--) #define stat_inc_total_hit(sbi) (atomic64_inc(&(sbi)->total_hit_ext)) #define stat_inc_rbtree_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_rbtree)) #define stat_inc_largest_node_hit(sbi) (atomic64_inc(&(sbi)->read_hit_largest)) @@ -1988,14 +2317,15 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) int f2fs_build_stats(struct f2fs_sb_info *); void f2fs_destroy_stats(struct f2fs_sb_info *); -void __init f2fs_create_root_stats(void); +int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else #define stat_inc_cp_count(si) +#define stat_inc_bg_cp_count(si) #define stat_inc_call_count(si) #define stat_inc_bggc_count(si) -#define stat_inc_dirty_dir(sbi) -#define stat_dec_dirty_dir(sbi) +#define stat_inc_dirty_inode(sbi, type) +#define stat_dec_dirty_inode(sbi, type) #define stat_inc_total_hit(sb) #define stat_inc_rbtree_node_hit(sb) #define stat_inc_largest_node_hit(sbi) @@ -2016,7 +2346,7 @@ void f2fs_destroy_root_stats(void); static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } -static inline void __init f2fs_create_root_stats(void) { } +static inline int __init f2fs_create_root_stats(void) { return 0; } static inline void f2fs_destroy_root_stats(void) { } #endif @@ -2045,16 +2375,15 @@ int f2fs_convert_inline_inode(struct inode *); int f2fs_write_inline_data(struct inode *, struct page *); bool recover_inline_data(struct inode *, struct page *); struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct f2fs_filename *, struct page **); -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *, struct page **); + struct fscrypt_name *, struct page **); int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, struct inode *, - nid_t, umode_t); +int f2fs_add_inline_entry(struct inode *, const struct qstr *, + const struct qstr *, struct inode *, nid_t, umode_t); void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, struct inode *, struct inode *); bool f2fs_empty_inline_dir(struct inode *); int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct f2fs_str *); + struct fscrypt_str *); int f2fs_inline_data_fiemap(struct inode *, struct fiemap_extent_info *, __u64, __u64); @@ -2070,8 +2399,8 @@ void f2fs_leave_shrinker(struct f2fs_sb_info *); * extent_cache.c */ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -void f2fs_drop_largest_extent(struct inode *, pgoff_t); -void f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); +void f2fs_drop_extent_tree(struct inode *); unsigned int f2fs_destroy_extent_node(struct inode *); void f2fs_destroy_extent_tree(struct inode *); bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); @@ -2085,13 +2414,9 @@ void destroy_extent_cache(void); /* * crypto support */ -static inline int f2fs_encrypted_inode(struct inode *inode) +static inline bool f2fs_encrypted_inode(struct inode *inode) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return file_is_encrypt(inode); -#else - return 0; -#endif } static inline void f2fs_set_encrypted_inode(struct inode *inode) @@ -2103,26 +2428,38 @@ static inline void f2fs_set_encrypted_inode(struct inode *inode) static inline bool f2fs_bio_encrypted(struct bio *bio) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - return unlikely(bio->bi_private != NULL); -#else - return false; -#endif + return bio->bi_private != NULL; } static inline int f2fs_sb_has_crypto(struct super_block *sb) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); -#else - return 0; -#endif +} + +static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); +} + +static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) +{ + clear_opt(sbi, ADAPTIVE); + clear_opt(sbi, LFS); + + switch (mt) { + case F2FS_MOUNT_ADAPTIVE: + set_opt(sbi, ADAPTIVE); + break; + case F2FS_MOUNT_LFS: + set_opt(sbi, LFS); + break; + } } static inline bool f2fs_may_encrypt(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION - mode_t mode = inode->i_mode; + umode_t mode = inode->i_mode; return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)); #else @@ -2130,74 +2467,28 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -/* crypto_policy.c */ -int f2fs_is_child_context_consistent_with_parent(struct inode *, - struct inode *); -int f2fs_inherit_context(struct inode *, struct inode *, struct page *); -int f2fs_process_policy(const struct f2fs_encryption_policy *, struct inode *); -int f2fs_get_policy(struct inode *, struct f2fs_encryption_policy *); - -/* crypt.c */ -extern struct kmem_cache *f2fs_crypt_info_cachep; -bool f2fs_valid_contents_enc_mode(uint32_t); -uint32_t f2fs_validate_encryption_key_size(uint32_t, uint32_t); -struct f2fs_crypto_ctx *f2fs_get_crypto_ctx(struct inode *); -void f2fs_release_crypto_ctx(struct f2fs_crypto_ctx *); -struct page *f2fs_encrypt(struct inode *, struct page *); -int f2fs_decrypt(struct f2fs_crypto_ctx *, struct page *); -int f2fs_decrypt_one(struct inode *, struct page *); -void f2fs_end_io_crypto_work(struct f2fs_crypto_ctx *, struct bio *); - -/* crypto_key.c */ -void f2fs_free_encryption_info(struct inode *, struct f2fs_crypt_info *); - -/* crypto_fname.c */ -bool f2fs_valid_filenames_enc_mode(uint32_t); -u32 f2fs_fname_crypto_round_up(u32, u32); -int f2fs_fname_crypto_alloc_buffer(struct inode *, u32, struct f2fs_str *); -int f2fs_fname_disk_to_usr(struct inode *, f2fs_hash_t *, - const struct f2fs_str *, struct f2fs_str *); -int f2fs_fname_usr_to_disk(struct inode *, const struct qstr *, - struct f2fs_str *); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION -void f2fs_restore_and_release_control_page(struct page **); -void f2fs_restore_control_page(struct page *); - -int __init f2fs_init_crypto(void); -int f2fs_crypto_initialize(void); -void f2fs_exit_crypto(void); - -int f2fs_has_encryption_key(struct inode *); - -int f2fs_get_encryption_info(struct inode *inode); - -void f2fs_fname_crypto_free_buffer(struct f2fs_str *); -int f2fs_fname_setup_filename(struct inode *, const struct qstr *, - int lookup, struct f2fs_filename *); -void f2fs_fname_free_filename(struct f2fs_filename *); -#else -static inline void f2fs_restore_and_release_control_page(struct page **p) { } -static inline void f2fs_restore_control_page(struct page *p) { } - -static inline int __init f2fs_init_crypto(void) { return 0; } -static inline void f2fs_exit_crypto(void) { } - -static inline int f2fs_has_encryption_key(struct inode *i) { return 0; } -static inline int f2fs_get_encryption_info(struct inode *i) { return 0; } -static inline void f2fs_fname_crypto_free_buffer(struct f2fs_str *p) { } - -static inline int f2fs_fname_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct f2fs_filename *fname) -{ - memset(fname, 0, sizeof(struct f2fs_filename)); - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *)iname->name; - fname->disk_name.len = iname->len; - return 0; -} - -static inline void f2fs_fname_free_filename(struct f2fs_filename *fname) { } +#ifndef CONFIG_F2FS_FS_ENCRYPTION +#define fscrypt_set_d_op(i) +#define fscrypt_get_ctx fscrypt_notsupp_get_ctx +#define fscrypt_release_ctx fscrypt_notsupp_release_ctx +#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page +#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page +#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages +#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page +#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page +#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range +#define fscrypt_process_policy fscrypt_notsupp_process_policy +#define fscrypt_get_policy fscrypt_notsupp_get_policy +#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context +#define fscrypt_inherit_context fscrypt_notsupp_inherit_context +#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info +#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info +#define fscrypt_setup_filename fscrypt_notsupp_setup_filename +#define fscrypt_free_filename fscrypt_notsupp_free_filename +#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size +#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer +#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer +#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr +#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk #endif #endif diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 4b449d263333..c6e33258fabf 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,8 @@ #include #include #include +#include +#include #include "f2fs.h" #include "node.h" @@ -40,8 +42,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct dnode_of_data dn; int err; - f2fs_balance_fs(sbi); - sb_start_pagefault(inode->i_sb); f2fs_bug_on(sbi, f2fs_has_inline_data(inode)); @@ -57,6 +57,8 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + f2fs_balance_fs(sbi, dn.node_changed); + file_update_time(vma->vm_file); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || @@ -74,19 +76,20 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, goto mapped; /* page is wholly or partially inside EOF */ - if (((loff_t)(page->index + 1) << PAGE_CACHE_SHIFT) > + if (((loff_t)(page->index + 1) << PAGE_SHIFT) > i_size_read(inode)) { unsigned offset; - offset = i_size_read(inode) & ~PAGE_CACHE_MASK; - zero_user_segment(page, offset, PAGE_CACHE_SIZE); + offset = i_size_read(inode) & ~PAGE_MASK; + zero_user_segment(page, offset, PAGE_SIZE); } set_page_dirty(page); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) @@ -96,6 +99,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); + f2fs_update_time(sbi, REQ_TIME); return block_page_mkwrite_return(err); } @@ -132,7 +136,7 @@ static inline bool need_do_checkpoint(struct inode *inode) if (!S_ISREG(inode->i_mode) || inode->i_nlink != 1) need_cp = true; - else if (file_enc_name(inode) && need_dentry_mark(sbi, inode->i_ino)) + else if (is_sbi_flag_set(sbi, SBI_NEED_CP)) need_cp = true; else if (file_wrong_pino(inode)) need_cp = true; @@ -170,21 +174,16 @@ static void try_to_fix_pino(struct inode *inode) fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { - fi->i_pino = pino; + f2fs_i_pino_write(inode, pino); file_got_pino(inode); - up_write(&fi->i_sem); - - mark_inode_dirty_sync(inode); - f2fs_write_inode(inode, NULL); - } else { - up_write(&fi->i_sem); } + up_write(&fi->i_sem); } -int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, + int datasync, bool atomic) { struct inode *inode = file->f_mapping->host; - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t ino = inode->i_ino; int ret = 0; @@ -201,10 +200,10 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trace_f2fs_sync_file_enter(inode); /* if fdatasync is triggered, let's do in-place-update */ - if (get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) - set_inode_flag(fi, FI_NEED_IPU); + if (datasync || get_dirty_pages(inode) <= SM_I(sbi)->min_fsync_blocks) + set_inode_flag(inode, FI_NEED_IPU); ret = filemap_write_and_wait_range(inode->i_mapping, start, end); - clear_inode_flag(fi, FI_NEED_IPU); + clear_inode_flag(inode, FI_NEED_IPU); if (ret) { trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -212,7 +211,7 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) } /* if the inode is dirty, let's recover all the time */ - if (!datasync) { + if (!datasync && !f2fs_skip_inode_update(inode)) { f2fs_write_inode(inode, NULL); goto go_write; } @@ -220,29 +219,26 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) /* * if there is no written data, don't waste time to write recovery info. */ - if (!is_inode_flag_set(fi, FI_APPEND_WRITE) && + if (!is_inode_flag_set(inode, FI_APPEND_WRITE) && !exist_written_data(sbi, ino, APPEND_INO)) { /* it may call write_inode just prior to fsync */ if (need_inode_page_update(sbi, ino)) goto go_write; - if (is_inode_flag_set(fi, FI_UPDATE_WRITE) || + if (is_inode_flag_set(inode, FI_UPDATE_WRITE) || exist_written_data(sbi, ino, UPDATE_INO)) goto flush_out; goto out; } go_write: - /* guarantee free sections for fsync */ - f2fs_balance_fs(sbi); - /* * Both of fdatasync() and fsync() are able to be recovered from * sudden-power-off. */ - down_read(&fi->i_sem); + down_read(&F2FS_I(inode)->i_sem); need_cp = need_do_checkpoint(inode); - up_read(&fi->i_sem); + up_read(&F2FS_I(inode)->i_sem); if (need_cp) { /* all the dirty node pages should be flushed for POR */ @@ -253,19 +249,23 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * will be used only for fsynced inodes after checkpoint. */ try_to_fix_pino(inode); - clear_inode_flag(fi, FI_APPEND_WRITE); - clear_inode_flag(fi, FI_UPDATE_WRITE); + clear_inode_flag(inode, FI_APPEND_WRITE); + clear_inode_flag(inode, FI_UPDATE_WRITE); goto out; } sync_nodes: - sync_node_pages(sbi, ino, &wbc); - - /* if cp_error was enabled, we should avoid infinite loop */ - if (unlikely(f2fs_cp_error(sbi))) + ret = fsync_node_pages(sbi, inode, &wbc, atomic); + if (ret) goto out; + /* if cp_error was enabled, we should avoid infinite loop */ + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + goto out; + } + if (need_inode_block_update(sbi, ino)) { - mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -275,18 +275,24 @@ int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; /* once recovery info is written, don't need to tack this */ - remove_dirty_inode(sbi, ino, APPEND_INO); - clear_inode_flag(fi, FI_APPEND_WRITE); + remove_ino_entry(sbi, ino, APPEND_INO); + clear_inode_flag(inode, FI_APPEND_WRITE); flush_out: - remove_dirty_inode(sbi, ino, UPDATE_INO); - clear_inode_flag(fi, FI_UPDATE_WRITE); + remove_ino_entry(sbi, ino, UPDATE_INO); + clear_inode_flag(inode, FI_UPDATE_WRITE); ret = f2fs_issue_flush(sbi); + f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); f2fs_trace_ios(NULL, 1); return ret; } +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) +{ + return f2fs_do_sync_file(file, start, end, datasync, false); +} + static pgoff_t __get_first_dirty_index(struct address_space *mapping, pgoff_t pgofs, int whence) { @@ -300,7 +306,7 @@ static pgoff_t __get_first_dirty_index(struct address_space *mapping, pagevec_init(&pvec, 0); nr_pages = pagevec_lookup_tag(&pvec, mapping, &pgofs, PAGECACHE_TAG_DIRTY, 1); - pgofs = nr_pages ? pvec.pages[0]->index : LONG_MAX; + pgofs = nr_pages ? pvec.pages[0]->index : ULONG_MAX; pagevec_release(&pvec); return pgofs; } @@ -332,7 +338,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -345,32 +351,31 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) goto found; } - pgofs = (pgoff_t)(offset >> PAGE_CACHE_SHIFT); + pgofs = (pgoff_t)(offset >> PAGE_SHIFT); dirty = __get_first_dirty_index(inode->i_mapping, pgofs, whence); - for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + for (; data_ofs < isize; data_ofs = (loff_t)pgofs << PAGE_SHIFT) { set_new_dnode(&dn, inode, NULL, NULL, 0); - err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE_RA); + err = get_dnode_of_data(&dn, pgofs, LOOKUP_NODE); if (err && err != -ENOENT) { goto fail; } else if (err == -ENOENT) { /* direct node does not exists */ if (whence == SEEK_DATA) { - pgofs = PGOFS_OF_NEXT_DNODE(pgofs, - F2FS_I(inode)); + pgofs = get_next_page_offset(&dn, pgofs); continue; } else { goto found; } } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; dn.ofs_in_node++, pgofs++, - data_ofs = (loff_t)pgofs << PAGE_CACHE_SHIFT) { + data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); @@ -387,10 +392,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -418,19 +423,20 @@ static loff_t f2fs_llseek(struct file *file, loff_t offset, int whence) static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + int err; if (f2fs_encrypted_inode(inode)) { - int err = f2fs_get_encryption_info(inode); + err = fscrypt_get_encryption_info(inode); if (err) return 0; + if (!f2fs_encrypted_inode(inode)) + return -ENOKEY; } /* we don't need to use inline_data strictly */ - if (f2fs_has_inline_data(inode)) { - int err = f2fs_convert_inline_inode(inode); - if (err) - return err; - } + err = f2fs_convert_inline_inode(inode); + if (err) + return err; file_accessed(file); vma->vm_ops = &f2fs_file_vm_ops; @@ -440,12 +446,22 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { int ret = generic_file_open(inode, filp); + struct dentry *dir; if (!ret && f2fs_encrypted_inode(inode)) { - ret = f2fs_get_encryption_info(inode); + ret = fscrypt_get_encryption_info(inode); if (ret) - ret = -EACCES; + return -EACCES; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; } + dir = dget_parent(file_dentry(filp)); + if (f2fs_encrypted_inode(d_inode(dir)) && + !fscrypt_has_permitted_context(d_inode(dir), inode)) { + dput(dir); + return -EPERM; + } + dput(dir); return ret; } @@ -468,8 +484,7 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) set_data_blkaddr(dn); invalidate_blocks(sbi, blkaddr); if (dn->ofs_in_node == 0 && IS_INODE(dn->node_page)) - clear_inode_flag(F2FS_I(dn->inode), - FI_FIRST_BLOCK_WRITTEN); + clear_inode_flag(dn->inode, FI_FIRST_BLOCK_WRITTEN); nr_free++; } @@ -480,14 +495,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) * we will invalidate all blkaddr in the whole range. */ fofs = start_bidx_of_node(ofs_of_node(dn->node_page), - F2FS_I(dn->inode)) + ofs; + dn->inode) + ofs; f2fs_update_extent_cache_range(dn, fofs, 0, len); dec_valid_block_count(sbi, dn->inode, nr_free); - set_page_dirty(dn->node_page); - sync_inode_page(dn); } dn->ofs_in_node = ofs; + f2fs_update_time(sbi, REQ_TIME); trace_f2fs_truncate_data_blocks_range(dn->inode, dn->nid, dn->ofs_in_node, nr_free); return nr_free; @@ -501,8 +515,8 @@ void truncate_data_blocks(struct dnode_of_data *dn) static int truncate_partial_data_page(struct inode *inode, u64 from, bool cache_only) { - unsigned offset = from & (PAGE_CACHE_SIZE - 1); - pgoff_t index = from >> PAGE_CACHE_SHIFT; + unsigned offset = from & (PAGE_SIZE - 1); + pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; struct page *page; @@ -510,7 +524,7 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, return 0; if (cache_only) { - page = f2fs_grab_cache_page(mapping, index, false); + page = find_lock_page(mapping, index); if (page && PageUptodate(page)) goto truncate_out; f2fs_put_page(page, 1); @@ -521,9 +535,10 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, if (IS_ERR(page)) return 0; truncate_out: - f2fs_wait_on_page_writeback(page, DATA); - zero_user(page, offset, PAGE_CACHE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || !S_ISREG(inode->i_mode)) + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user(page, offset, PAGE_SIZE - offset); + if (!cache_only || !f2fs_encrypted_inode(inode) || + !S_ISREG(inode->i_mode)) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -543,6 +558,9 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) free_from = (pgoff_t)F2FS_BYTES_TO_BLK(from + blocksize - 1); + if (free_from >= sbi->max_file_blocks) + goto free_partial; + if (lock) f2fs_lock_op(sbi); @@ -561,14 +579,14 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } set_new_dnode(&dn, inode, ipage, NULL, 0); - err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE); + err = get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) goto free_next; goto out; } - count = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + count = ADDRS_PER_PAGE(dn.node_page, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); @@ -584,7 +602,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) out: if (lock) f2fs_unlock_op(sbi); - +free_partial: /* lastly zero out the first data page */ if (!err) err = truncate_partial_data_page(inode, from, truncate_page); @@ -593,7 +611,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) return err; } -int f2fs_truncate(struct inode *inode, bool lock) +int f2fs_truncate(struct inode *inode) { int err; @@ -604,18 +622,18 @@ int f2fs_truncate(struct inode *inode, bool lock) trace_f2fs_truncate(inode); /* we should check inline_data size */ - if (f2fs_has_inline_data(inode) && !f2fs_may_inline_data(inode)) { + if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; } - err = truncate_blocks(inode, i_size_read(inode), lock); + err = truncate_blocks(inode, i_size_read(inode), true); if (err) return err; inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -631,7 +649,6 @@ int f2fs_getattr(struct vfsmount *mnt, #ifdef CONFIG_F2FS_FS_POSIX_ACL static void __setattr_copy(struct inode *inode, const struct iattr *attr) { - struct f2fs_inode_info *fi = F2FS_I(inode); unsigned int ia_valid = attr->ia_valid; if (ia_valid & ATTR_UID) @@ -652,7 +669,7 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) if (!in_group_p(inode->i_gid) && !capable(CAP_FSETID)) mode &= ~S_ISGID; - set_acl_inode(fi, mode); + set_acl_inode(inode, mode); } } #else @@ -662,7 +679,6 @@ static void __setattr_copy(struct inode *inode, const struct iattr *attr) int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); - struct f2fs_inode_info *fi = F2FS_I(inode); int err; err = inode_change_ok(inode, attr); @@ -671,21 +687,28 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_SIZE) { if (f2fs_encrypted_inode(inode) && - f2fs_get_encryption_info(inode)) + fscrypt_get_encryption_info(inode)) return -EACCES; if (attr->ia_size <= i_size_read(inode)) { truncate_setsize(inode, attr->ia_size); - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode)); + f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is * larger than i_size. */ truncate_setsize(inode, attr->ia_size); + + /* should convert inline inode here */ + if (!f2fs_may_inline_data(inode)) { + err = f2fs_convert_inline_inode(inode); + if (err) + return err; + } inode->i_mtime = inode->i_ctime = CURRENT_TIME; } } @@ -694,13 +717,13 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (attr->ia_valid & ATTR_MODE) { err = posix_acl_chmod(inode, get_inode_mode(inode)); - if (err || is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; - clear_inode_flag(fi, FI_ACL_MODE); + if (err || is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; + clear_inode_flag(inode, FI_ACL_MODE); } } - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return err; } @@ -727,7 +750,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (!len) return 0; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); page = get_new_data_page(inode, NULL, index, false); @@ -736,7 +759,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, if (IS_ERR(page)) return PTR_ERR(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, start, len); set_page_dirty(page); f2fs_put_page(page, 1); @@ -761,7 +784,7 @@ int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(dn.node_page, F2FS_I(inode)); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -778,19 +801,17 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) { pgoff_t pg_start, pg_end; loff_t off_start, off_end; - int ret = 0; + int ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -800,7 +821,7 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; } @@ -815,10 +836,10 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) loff_t blk_start, blk_end; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); - blk_start = (loff_t)pg_start << PAGE_CACHE_SHIFT; - blk_end = (loff_t)pg_end << PAGE_CACHE_SHIFT; + blk_start = (loff_t)pg_start << PAGE_SHIFT; + blk_end = (loff_t)pg_end << PAGE_SHIFT; truncate_inode_pages_range(mapping, blk_start, blk_end - 1); @@ -831,83 +852,199 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) return ret; } -static int __exchange_data_block(struct inode *inode, pgoff_t src, - pgoff_t dst, bool full) +static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, pgoff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - block_t new_addr; - bool do_replace = false; - int ret; + int ret, done, i; +next_dnode: set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = get_dnode_of_data(&dn, src, LOOKUP_NODE_RA); + ret = get_dnode_of_data(&dn, off, LOOKUP_NODE_RA); if (ret && ret != -ENOENT) { return ret; } else if (ret == -ENOENT) { - new_addr = NULL_ADDR; - } else { - new_addr = dn.data_blkaddr; - if (!is_checkpointed_data(sbi, new_addr)) { - dn.data_blkaddr = NULL_ADDR; + if (dn.max_level == 0) + return -ENOENT; + done = min((pgoff_t)ADDRS_PER_BLOCK - dn.ofs_in_node, len); + blkaddr += done; + do_replace += done; + goto next; + } + + done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + dn.ofs_in_node, len); + for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { + *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + if (!is_checkpointed_data(sbi, *blkaddr)) { + + if (test_opt(sbi, LFS)) { + f2fs_put_dnode(&dn); + return -ENOTSUPP; + } + /* do not invalidate this block address */ - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - do_replace = true; + f2fs_update_data_blkaddr(&dn, NULL_ADDR); + *do_replace = 1; + } + } + f2fs_put_dnode(&dn); +next: + len -= done; + off += done; + if (len) + goto next_dnode; + return 0; +} + +static int __roll_back_blkaddrs(struct inode *inode, block_t *blkaddr, + int *do_replace, pgoff_t off, int len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct dnode_of_data dn; + int ret, i; + + for (i = 0; i < len; i++, do_replace++, blkaddr++) { + if (*do_replace == 0) + continue; + + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, off + i, LOOKUP_NODE_RA); + if (ret) { + dec_valid_block_count(sbi, inode, 1); + invalidate_blocks(sbi, *blkaddr); + } else { + f2fs_update_data_blkaddr(&dn, *blkaddr); } f2fs_put_dnode(&dn); } + return 0; +} - if (new_addr == NULL_ADDR) - return full ? truncate_hole(inode, dst, dst + 1) : 0; +static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, + block_t *blkaddr, int *do_replace, + pgoff_t src, pgoff_t dst, pgoff_t len, bool full) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(src_inode); + pgoff_t i = 0; + int ret; - if (do_replace) { - struct page *ipage = get_node_page(sbi, inode->i_ino); - struct node_info ni; - - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - goto err_out; + while (i < len) { + if (blkaddr[i] == NULL_ADDR && !full) { + i++; + continue; } - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, dst); - if (ret) - goto err_out; + if (do_replace[i] || blkaddr[i] == NULL_ADDR) { + struct dnode_of_data dn; + struct node_info ni; + size_t new_size; + pgoff_t ilen; - truncate_data_blocks_range(&dn, 1); + set_new_dnode(&dn, dst_inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, dst + i, ALLOC_NODE); + if (ret) + return ret; - get_node_info(sbi, dn.nid, &ni); - f2fs_replace_block(sbi, &dn, dn.data_blkaddr, new_addr, - ni.version, true); - f2fs_put_dnode(&dn); - } else { - struct page *psrc, *pdst; + get_node_info(sbi, dn.nid, &ni); + ilen = min((pgoff_t) + ADDRS_PER_PAGE(dn.node_page, dst_inode) - + dn.ofs_in_node, len - i); + do { + dn.data_blkaddr = datablock_addr(dn.node_page, + dn.ofs_in_node); + truncate_data_blocks_range(&dn, 1); - psrc = get_lock_data_page(inode, src, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = get_new_data_page(inode, NULL, dst, false); - if (IS_ERR(pdst)) { + if (do_replace[i]) { + f2fs_i_blocks_write(src_inode, + 1, false); + f2fs_i_blocks_write(dst_inode, + 1, true); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + blkaddr[i], ni.version, true, false); + + do_replace[i] = 0; + } + dn.ofs_in_node++; + i++; + new_size = (dst + i) << PAGE_SHIFT; + if (dst_inode->i_size < new_size) + f2fs_i_size_write(dst_inode, new_size); + } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + + f2fs_put_dnode(&dn); + } else { + struct page *psrc, *pdst; + + psrc = get_lock_data_page(src_inode, src + i, true); + if (IS_ERR(psrc)) + return PTR_ERR(psrc); + pdst = get_new_data_page(dst_inode, NULL, dst + i, + true); + if (IS_ERR(pdst)) { + f2fs_put_page(psrc, 1); + return PTR_ERR(pdst); + } + f2fs_copy_page(psrc, pdst); + set_page_dirty(pdst); + f2fs_put_page(pdst, 1); f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); - } - f2fs_copy_page(psrc, pdst); - set_page_dirty(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); - return truncate_hole(inode, src, src + 1); + ret = truncate_hole(src_inode, src + i, src + i + 1); + if (ret) + return ret; + i++; + } + } + return 0; +} + +static int __exchange_data_block(struct inode *src_inode, + struct inode *dst_inode, pgoff_t src, pgoff_t dst, + pgoff_t len, bool full) +{ + block_t *src_blkaddr; + int *do_replace; + pgoff_t olen; + int ret; + + while (len) { + olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); + + src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + if (!src_blkaddr) + return -ENOMEM; + + do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + if (!do_replace) { + kvfree(src_blkaddr); + return -ENOMEM; + } + + ret = __read_out_blkaddrs(src_inode, src_blkaddr, + do_replace, src, olen); + if (ret) + goto roll_back; + + ret = __clone_blkaddrs(src_inode, dst_inode, src_blkaddr, + do_replace, src, dst, olen, full); + if (ret) + goto roll_back; + + src += olen; + dst += olen; + len -= olen; + + kvfree(src_blkaddr); + kvfree(do_replace); } return 0; -err_out: - if (!get_dnode_of_data(&dn, src, LOOKUP_NODE)) { - dn.data_blkaddr = new_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - f2fs_put_dnode(&dn); - } +roll_back: + __roll_back_blkaddrs(src_inode, src_blkaddr, do_replace, src, len); + kvfree(src_blkaddr); + kvfree(do_replace); return ret; } @@ -915,16 +1052,15 @@ static int f2fs_do_collapse(struct inode *inode, pgoff_t start, pgoff_t end) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); pgoff_t nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; - int ret = 0; + int ret; - for (; end < nrpages; start++, end++) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, end, start, true); - f2fs_unlock_op(sbi); - if (ret) - break; - } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, end, start, nrpages - end, true); + f2fs_unlock_op(sbi); return ret; } @@ -941,16 +1077,12 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(F2FS_I_SB(inode)); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } - - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -972,7 +1104,50 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) ret = truncate_blocks(inode, new_size, true); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); + + return ret; +} + +static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, + pgoff_t end) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + pgoff_t index = start; + unsigned int ofs_in_node = dn->ofs_in_node; + blkcnt_t count = 0; + int ret; + + for (; index < end; index++, dn->ofs_in_node++) { + if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + count++; + } + + dn->ofs_in_node = ofs_in_node; + ret = reserve_new_blocks(dn, count); + if (ret) + return ret; + + dn->ofs_in_node = ofs_in_node; + for (index = start; index < end; index++, dn->ofs_in_node++) { + dn->data_blkaddr = + datablock_addr(dn->node_page, dn->ofs_in_node); + /* + * reserve_new_blocks will not guarantee entire block + * allocation. + */ + if (dn->data_blkaddr == NULL_ADDR) { + ret = -ENOSPC; + break; + } + if (dn->data_blkaddr != NEW_ADDR) { + invalidate_blocks(sbi, dn->data_blkaddr); + dn->data_blkaddr = NEW_ADDR; + set_data_blkaddr(dn); + } + } + + f2fs_update_extent_cache_range(dn, start, 0, index - start); return ret; } @@ -991,13 +1166,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; - f2fs_balance_fs(sbi); - - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) @@ -1005,11 +1176,11 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, truncate_pagecache_range(inode, offset, offset + len - 1); - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; + pg_start = ((unsigned long long) offset) >> PAGE_SHIFT; + pg_end = ((unsigned long long) offset + len) >> PAGE_SHIFT; - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); + off_start = offset & (PAGE_SIZE - 1); + off_end = (offset + len) & (PAGE_SIZE - 1); if (pg_start == pg_end) { ret = fill_zero(inode, pg_start, off_start, @@ -1023,48 +1194,40 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, - PAGE_CACHE_SIZE - off_start); + PAGE_SIZE - off_start); if (ret) return ret; new_size = max_t(loff_t, new_size, - (loff_t)pg_start << PAGE_CACHE_SHIFT); + (loff_t)pg_start << PAGE_SHIFT); } - for (index = pg_start; index < pg_end; index++) { + for (index = pg_start; index < pg_end;) { struct dnode_of_data dn; - struct page *ipage; + unsigned int end_offset; + pgoff_t end; f2fs_lock_op(sbi); - ipage = get_node_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - ret = PTR_ERR(ipage); - f2fs_unlock_op(sbi); - goto out; - } - - set_new_dnode(&dn, inode, ipage, NULL, 0); - ret = f2fs_reserve_block(&dn, index); + set_new_dnode(&dn, inode, NULL, NULL, 0); + ret = get_dnode_of_data(&dn, index, ALLOC_NODE); if (ret) { f2fs_unlock_op(sbi); goto out; } - if (dn.data_blkaddr != NEW_ADDR) { - invalidate_blocks(sbi, dn.data_blkaddr); + end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end = min(pg_end, end_offset - dn.ofs_in_node + index); - dn.data_blkaddr = NEW_ADDR; - set_data_blkaddr(&dn); - - dn.data_blkaddr = NULL_ADDR; - f2fs_update_extent_cache(&dn); - } + ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + if (ret) + goto out; + index = end; new_size = max_t(loff_t, new_size, - (loff_t)(index + 1) << PAGE_CACHE_SHIFT); + (loff_t)index << PAGE_SHIFT); } if (off_end) { @@ -1077,11 +1240,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, } out: - if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1089,7 +1249,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t pg_start, pg_end, delta, nrpages, idx; + pgoff_t nr, pg_start, pg_end, delta, idx; loff_t new_size; int ret = 0; @@ -1104,13 +1264,11 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (offset & (F2FS_BLKSIZE - 1) || len & (F2FS_BLKSIZE - 1)) return -EINVAL; - f2fs_balance_fs(sbi); + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; - } + f2fs_balance_fs(sbi, true); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) @@ -1123,17 +1281,23 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); - pg_start = offset >> PAGE_CACHE_SHIFT; - pg_end = (offset + len) >> PAGE_CACHE_SHIFT; + pg_start = offset >> PAGE_SHIFT; + pg_end = (offset + len) >> PAGE_SHIFT; delta = pg_end - pg_start; - nrpages = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + idx = (i_size_read(inode) + PAGE_SIZE - 1) / PAGE_SIZE; + + while (!ret && idx > pg_start) { + nr = idx - pg_start; + if (nr > delta) + nr = delta; + idx -= nr; - for (idx = nrpages - 1; idx >= pg_start && idx != -1; idx--) { f2fs_lock_op(sbi); - ret = __exchange_data_block(inode, idx, idx + delta, false); + f2fs_drop_extent_tree(inode); + + ret = __exchange_data_block(inode, inode, idx, + idx + delta, nr, false); f2fs_unlock_op(sbi); - if (ret) - break; } /* write out all moved pages, if possible */ @@ -1141,7 +1305,7 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) truncate_pagecache(inode, offset); if (!ret) - i_size_write(inode, new_size); + f2fs_i_size_write(inode, new_size); return ret; } @@ -1149,60 +1313,48 @@ static int expand_inode_data(struct inode *inode, loff_t offset, loff_t len, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - pgoff_t index, pg_start, pg_end; + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + pgoff_t pg_end; loff_t new_size = i_size_read(inode); - loff_t off_start, off_end; - int ret = 0; - - f2fs_balance_fs(sbi); + loff_t off_end; + int ret; ret = inode_newsize_ok(inode, (len + offset)); if (ret) return ret; - if (f2fs_has_inline_data(inode)) { - ret = f2fs_convert_inline_inode(inode); - if (ret) + ret = f2fs_convert_inline_inode(inode); + if (ret) + return ret; + + f2fs_balance_fs(sbi, true); + + pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; + off_end = (offset + len) & (PAGE_SIZE - 1); + + map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; + map.m_len = pg_end - map.m_lblk; + if (off_end) + map.m_len++; + + ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (ret) { + pgoff_t last_off; + + if (!map.m_len) return ret; + + last_off = map.m_lblk + map.m_len - 1; + + /* update new size to the failed position */ + new_size = (last_off == pg_end) ? offset + len: + (loff_t)(last_off + 1) << PAGE_SHIFT; + } else { + new_size = ((loff_t)pg_end << PAGE_SHIFT) + off_end; } - pg_start = ((unsigned long long) offset) >> PAGE_CACHE_SHIFT; - pg_end = ((unsigned long long) offset + len) >> PAGE_CACHE_SHIFT; - - off_start = offset & (PAGE_CACHE_SIZE - 1); - off_end = (offset + len) & (PAGE_CACHE_SIZE - 1); - - f2fs_lock_op(sbi); - - for (index = pg_start; index <= pg_end; index++) { - struct dnode_of_data dn; - - if (index == pg_end && !off_end) - goto noalloc; - - set_new_dnode(&dn, inode, NULL, NULL, 0); - ret = f2fs_reserve_block(&dn, index); - if (ret) - break; -noalloc: - if (pg_start == pg_end) - new_size = offset + len; - else if (index == pg_start && off_start) - new_size = (loff_t)(index + 1) << PAGE_CACHE_SHIFT; - else if (index == pg_end) - new_size = ((loff_t)index << PAGE_CACHE_SHIFT) + - off_end; - else - new_size += PAGE_CACHE_SIZE; - } - - if (!(mode & FALLOC_FL_KEEP_SIZE) && - i_size_read(inode) < new_size) { - i_size_write(inode, new_size); - mark_inode_dirty(inode); - update_inode_page(inode); - } - f2fs_unlock_op(sbi); + if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) + f2fs_i_size_write(inode, new_size); return ret; } @@ -1226,7 +1378,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1245,11 +1397,12 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1257,13 +1410,22 @@ static long f2fs_fallocate(struct file *file, int mode, static int f2fs_release_file(struct inode *inode, struct file *filp) { + /* + * f2fs_relase_file is called at every close calls. So we should + * not drop any inmemory pages by close called by other process. + */ + if (!(filp->f_mode & FMODE_WRITE) || + atomic_read(&inode->i_writecount) != 1) + return 0; + /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { - set_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_VOLATILE_FILE); + set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); - clear_inode_flag(F2FS_I(inode), FI_DROP_CACHE); + clear_inode_flag(inode, FI_DROP_CACHE); } return 0; } @@ -1293,33 +1455,29 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int flags = fi->i_flags & FS_FL_USER_VISIBLE; + unsigned int flags; unsigned int oldflags; int ret; + if (!inode_owner_or_capable(inode)) + return -EACCES; + + if (get_user(flags, (int __user *)arg)) + return -EFAULT; + ret = mnt_want_write_file(filp); if (ret) return ret; - if (!inode_owner_or_capable(inode)) { - ret = -EACCES; - goto out; - } - - if (get_user(flags, (int __user *)arg)) { - ret = -EFAULT; - goto out; - } - flags = f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1328,11 +1486,10 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); - f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(inode); + f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); return ret; @@ -1353,17 +1510,35 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - f2fs_balance_fs(F2FS_I_SB(inode)); - - if (f2fs_is_atomic_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_atomic_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_ATOMIC_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); + + if (!get_dirty_pages(inode)) + goto out; + + f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, + "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + inode->i_ino, get_dirty_pages(inode)); + ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); + if (ret) + clear_inode_flag(inode, FI_ATOMIC_FILE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_commit_atomic_write(struct file *filp) @@ -1374,22 +1549,27 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - ret = mnt_want_write_file(filp); if (ret) return ret; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto err_out; + if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - ret = commit_inmem_pages(inode, false); - if (ret) + clear_inode_flag(inode, FI_ATOMIC_FILE); + ret = commit_inmem_pages(inode); + if (ret) { + set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; + } } - ret = f2fs_sync_file(filp, 0, LLONG_MAX, 0); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1402,31 +1582,54 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; - if (f2fs_is_volatile_file(inode)) - return 0; - - ret = f2fs_convert_inline_inode(inode); + ret = mnt_want_write_file(filp); if (ret) return ret; - set_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - return 0; + inode_lock(inode); + + if (f2fs_is_volatile_file(inode)) + goto out; + + ret = f2fs_convert_inline_inode(inode); + if (ret) + goto out; + + set_inode_flag(inode, FI_VOLATILE_FILE); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_release_volatile_write(struct file *filp) { struct inode *inode = file_inode(filp); + int ret; if (!inode_owner_or_capable(inode)) return -EACCES; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + inode_lock(inode); + if (!f2fs_is_volatile_file(inode)) - return 0; + goto out; - if (!f2fs_is_first_block_written(inode)) - return truncate_partial_data_page(inode, 0, true); + if (!f2fs_is_first_block_written(inode)) { + ret = truncate_partial_data_page(inode, 0, true); + goto out; + } - return punch_hole(inode, 0, F2FS_BLKSIZE); + ret = punch_hole(inode, 0, F2FS_BLKSIZE); +out: + inode_unlock(inode); + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_abort_volatile_write(struct file *filp) @@ -1441,13 +1644,19 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) if (ret) return ret; - f2fs_balance_fs(F2FS_I_SB(inode)); + inode_lock(inode); - clear_inode_flag(F2FS_I(inode), FI_ATOMIC_FILE); - clear_inode_flag(F2FS_I(inode), FI_VOLATILE_FILE); - commit_inmem_pages(inode, true); + if (f2fs_is_atomic_file(inode)) + drop_inmem_pages(inode); + if (f2fs_is_volatile_file(inode)) { + clear_inode_flag(inode, FI_VOLATILE_FILE); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + } + + inode_unlock(inode); mnt_drop_write_file(filp); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return ret; } @@ -1457,6 +1666,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct super_block *sb = sbi->sb; __u32 in; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1464,30 +1674,38 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) if (get_user(in, (__u32 __user *)arg)) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + switch (in) { case F2FS_GOING_DOWN_FULLSYNC: sb = freeze_bdev(sb->s_bdev); if (sb && !IS_ERR(sb)) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); thaw_bdev(sb->s_bdev, sb); } break; case F2FS_GOING_DOWN_METASYNC: /* do checkpoint only */ f2fs_sync_fs(sb, 1); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_NOSYNC: - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: sync_meta_pages(sbi, META, LONG_MAX); - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); break; default: - return -EINVAL; + ret = -EINVAL; + goto out; } - return 0; + f2fs_update_time(sbi, REQ_TIME); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -1508,15 +1726,21 @@ static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) sizeof(range))) return -EFAULT; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + range.minlen = max((unsigned int)range.minlen, q->limits.discard_granularity); ret = f2fs_trim_fs(F2FS_SB(sb), &range); + mnt_drop_write_file(filp); if (ret < 0) return ret; if (copy_to_user((struct fstrim_range __user *)arg, &range, sizeof(range))) return -EFAULT; + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); return 0; } @@ -1532,45 +1756,31 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - int err; - if (copy_from_user(&policy, (struct f2fs_encryption_policy __user *)arg, - sizeof(policy))) + if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, + sizeof(policy))) return -EFAULT; - mutex_lock(&inode->i_mutex); + f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - err = f2fs_process_policy(&policy, inode); - - mutex_unlock(&inode->i_mutex); - - return err; -#else - return -EOPNOTSUPP; -#endif + return fscrypt_process_policy(filp, &policy); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { -#ifdef CONFIG_F2FS_FS_ENCRYPTION - struct f2fs_encryption_policy policy; + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int err; - err = f2fs_get_policy(inode, &policy); + err = fscrypt_get_policy(inode, &policy); if (err) return err; - if (copy_to_user((struct f2fs_encryption_policy __user *)arg, &policy, - sizeof(policy))) + if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) return -EFAULT; return 0; -#else - return -EOPNOTSUPP; -#endif } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1593,13 +1803,13 @@ static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) generate_random_uuid(sbi->raw_super->encrypt_pw_salt); err = f2fs_commit_super(sbi, false); - - mnt_drop_write_file(filp); if (err) { /* undo new data */ memset(sbi->raw_super->encrypt_pw_salt, 0, 16); + mnt_drop_write_file(filp); return err; } + mnt_drop_write_file(filp); got_it: if (copy_to_user((__u8 __user *)arg, sbi->raw_super->encrypt_pw_salt, 16)) @@ -1612,6 +1822,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); __u32 sync; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1622,21 +1833,30 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; + ret = mnt_want_write_file(filp); + if (ret) + return ret; + if (!sync) { - if (!mutex_trylock(&sbi->gc_mutex)) - return -EBUSY; + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } } else { mutex_lock(&sbi->gc_mutex); } - return f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync); +out: + mnt_drop_write_file(filp); + return ret; } static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct cp_control cpc; + int ret; if (!capable(CAP_SYS_ADMIN)) return -EPERM; @@ -1644,13 +1864,343 @@ static int f2fs_ioc_write_checkpoint(struct file *filp, unsigned long arg) if (f2fs_readonly(sbi->sb)) return -EROFS; - cpc.reason = __get_cp_reason(sbi); + ret = mnt_want_write_file(filp); + if (ret) + return ret; - mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); - mutex_unlock(&sbi->gc_mutex); + ret = f2fs_sync_fs(sbi->sb, 1); - return 0; + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_defragment_range(struct f2fs_sb_info *sbi, + struct file *filp, + struct f2fs_defragment *range) +{ + struct inode *inode = file_inode(filp); + struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; + struct extent_info ei; + pgoff_t pg_start, pg_end; + unsigned int blk_per_seg = sbi->blocks_per_seg; + unsigned int total = 0, sec_num; + unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; + block_t blk_end = 0; + bool fragmented = false; + int err; + + /* if in-place-update policy is enabled, don't waste time here */ + if (need_inplace_update(inode)) + return -EINVAL; + + pg_start = range->start >> PAGE_SHIFT; + pg_end = (range->start + range->len) >> PAGE_SHIFT; + + f2fs_balance_fs(sbi, true); + + inode_lock(inode); + + /* writeback all dirty pages in the range */ + err = filemap_write_and_wait_range(inode->i_mapping, range->start, + range->start + range->len - 1); + if (err) + goto out; + + /* + * lookup mapping info in extent cache, skip defragmenting if physical + * block addresses are continuous. + */ + if (f2fs_lookup_extent_cache(inode, pg_start, &ei)) { + if (ei.fofs + ei.len >= pg_end) + goto out; + } + + map.m_lblk = pg_start; + + /* + * lookup mapping info in dnode page cache, skip defragmenting if all + * physical block addresses are continuous even if there are hole(s) + * in logical blocks. + */ + while (map.m_lblk < pg_end) { + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + if (blk_end && blk_end != map.m_pblk) { + fragmented = true; + break; + } + blk_end = map.m_pblk + map.m_len; + + map.m_lblk += map.m_len; + } + + if (!fragmented) + goto out; + + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; + + sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + + /* + * make sure there are enough free section for LFS allocation, this can + * avoid defragment running in SSR mode when free section are allocated + * intensively + */ + if (has_not_enough_free_secs(sbi, 0, sec_num)) { + err = -EAGAIN; + goto out; + } + + while (map.m_lblk < pg_end) { + pgoff_t idx; + int cnt = 0; + +do_map: + map.m_len = pg_end - map.m_lblk; + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + if (err) + goto clear_out; + + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + map.m_lblk++; + continue; + } + + set_inode_flag(inode, FI_DO_DEFRAG); + + idx = map.m_lblk; + while (idx < map.m_lblk + map.m_len && cnt < blk_per_seg) { + struct page *page; + + page = get_lock_data_page(inode, idx, true); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto clear_out; + } + + set_page_dirty(page); + f2fs_put_page(page, 1); + + idx++; + cnt++; + total++; + } + + map.m_lblk = idx; + + if (idx < pg_end && cnt < blk_per_seg) + goto do_map; + + clear_inode_flag(inode, FI_DO_DEFRAG); + + err = filemap_fdatawrite(inode->i_mapping); + if (err) + goto out; + } +clear_out: + clear_inode_flag(inode, FI_DO_DEFRAG); +out: + inode_unlock(inode); + if (!err) + range->len = (u64)total << PAGE_SHIFT; + return err; +} + +static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_defragment range; + int err; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + + err = mnt_want_write_file(filp); + if (err) + return err; + + if (f2fs_readonly(sbi->sb)) { + err = -EROFS; + goto out; + } + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) { + err = -EFAULT; + goto out; + } + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || + range.len & (F2FS_BLKSIZE - 1)) { + err = -EINVAL; + goto out; + } + + err = f2fs_defragment_range(sbi, filp, &range); + f2fs_update_time(sbi, REQ_TIME); + if (err < 0) + goto out; + + if (copy_to_user((struct f2fs_defragment __user *)arg, &range, + sizeof(range))) + err = -EFAULT; +out: + mnt_drop_write_file(filp); + return err; +} + +static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, + struct file *file_out, loff_t pos_out, size_t len) +{ + struct inode *src = file_inode(file_in); + struct inode *dst = file_inode(file_out); + struct f2fs_sb_info *sbi = F2FS_I_SB(src); + size_t olen = len, dst_max_i_size = 0; + size_t dst_osize; + int ret; + + if (file_in->f_path.mnt != file_out->f_path.mnt || + src->i_sb != dst->i_sb) + return -EXDEV; + + if (unlikely(f2fs_readonly(src->i_sb))) + return -EROFS; + + if (!S_ISREG(src->i_mode) || !S_ISREG(dst->i_mode)) + return -EINVAL; + + if (f2fs_encrypted_inode(src) || f2fs_encrypted_inode(dst)) + return -EOPNOTSUPP; + + if (src == dst) { + if (pos_in == pos_out) + return 0; + if (pos_out > pos_in && pos_out < pos_in + len) + return -EINVAL; + } + + inode_lock(src); + if (src != dst) { + if (!inode_trylock(dst)) { + ret = -EBUSY; + goto out; + } + } + + ret = -EINVAL; + if (pos_in + len > src->i_size || pos_in + len < pos_in) + goto out_unlock; + if (len == 0) + olen = len = src->i_size - pos_in; + if (pos_in + len == src->i_size) + len = ALIGN(src->i_size, F2FS_BLKSIZE) - pos_in; + if (len == 0) { + ret = 0; + goto out_unlock; + } + + dst_osize = dst->i_size; + if (pos_out + olen > dst->i_size) + dst_max_i_size = pos_out + olen; + + /* verify the end result is block aligned */ + if (!IS_ALIGNED(pos_in, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_in + len, F2FS_BLKSIZE) || + !IS_ALIGNED(pos_out, F2FS_BLKSIZE)) + goto out_unlock; + + ret = f2fs_convert_inline_inode(src); + if (ret) + goto out_unlock; + + ret = f2fs_convert_inline_inode(dst); + if (ret) + goto out_unlock; + + /* write out all dirty pages from offset */ + ret = filemap_write_and_wait_range(src->i_mapping, + pos_in, pos_in + len); + if (ret) + goto out_unlock; + + ret = filemap_write_and_wait_range(dst->i_mapping, + pos_out, pos_out + len); + if (ret) + goto out_unlock; + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + ret = __exchange_data_block(src, dst, pos_in >> F2FS_BLKSIZE_BITS, + pos_out >> F2FS_BLKSIZE_BITS, + len >> F2FS_BLKSIZE_BITS, false); + + if (!ret) { + if (dst_max_i_size) + f2fs_i_size_write(dst, dst_max_i_size); + else if (dst_osize != dst->i_size) + f2fs_i_size_write(dst, dst_osize); + } + f2fs_unlock_op(sbi); +out_unlock: + if (src != dst) + inode_unlock(dst); +out: + inode_unlock(src); + return ret; +} + +static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) +{ + struct f2fs_move_range range; + struct fd dst; + int err; + + if (!(filp->f_mode & FMODE_READ) || + !(filp->f_mode & FMODE_WRITE)) + return -EBADF; + + if (copy_from_user(&range, (struct f2fs_move_range __user *)arg, + sizeof(range))) + return -EFAULT; + + dst = fdget(range.dst_fd); + if (!dst.file) + return -EBADF; + + if (!(dst.file->f_mode & FMODE_WRITE)) { + err = -EBADF; + goto err_out; + } + + err = mnt_want_write_file(filp); + if (err) + goto err_out; + + err = f2fs_move_file_range(filp, range.pos_in, dst.file, + range.pos_out, range.len); + + mnt_drop_write_file(filp); + + if (copy_to_user((struct f2fs_move_range __user *)arg, + &range, sizeof(range))) + err = -EFAULT; +err_out: + fdput(dst); + return err; } long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) @@ -1686,6 +2236,10 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_gc(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); + case F2FS_IOC_DEFRAGMENT: + return f2fs_ioc_defragment(filp, arg); + case F2FS_IOC_MOVE_RANGE: + return f2fs_ioc_move_range(filp, arg); default: return -ENOTTY; } @@ -1693,14 +2247,36 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - struct inode *inode = file_inode(iocb->ki_filp); + struct file *file = iocb->ki_filp; + struct inode *inode = file_inode(file); + struct blk_plug plug; + ssize_t ret; if (f2fs_encrypted_inode(inode) && - !f2fs_has_encryption_key(inode) && - f2fs_get_encryption_info(inode)) + !fscrypt_has_encryption_key(inode) && + fscrypt_get_encryption_info(inode)) return -EACCES; - return generic_file_write_iter(iocb, from); + inode_lock(inode); + ret = generic_write_checks(iocb, from); + if (ret > 0) { + ret = f2fs_preallocate_blocks(iocb, from); + if (!ret) { + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); + } + } + inode_unlock(inode); + + if (ret > 0) { + ssize_t err; + + err = generic_write_sync(file, iocb->ki_pos - ret, ret); + if (err < 0) + ret = err; + } + return ret; } #ifdef CONFIG_COMPAT @@ -1713,6 +2289,24 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC32_SETFLAGS: cmd = F2FS_IOC_SETFLAGS; break; + case F2FS_IOC32_GETVERSION: + cmd = F2FS_IOC_GETVERSION; + break; + case F2FS_IOC_START_ATOMIC_WRITE: + case F2FS_IOC_COMMIT_ATOMIC_WRITE: + case F2FS_IOC_START_VOLATILE_WRITE: + case F2FS_IOC_RELEASE_VOLATILE_WRITE: + case F2FS_IOC_ABORT_VOLATILE_WRITE: + case F2FS_IOC_SHUTDOWN: + case F2FS_IOC_SET_ENCRYPTION_POLICY: + case F2FS_IOC_GET_ENCRYPTION_PWSALT: + case F2FS_IOC_GET_ENCRYPTION_POLICY: + case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_WRITE_CHECKPOINT: + case F2FS_IOC_DEFRAGMENT: + break; + case F2FS_IOC_MOVE_RANGE: + break; default: return -ENOIOCTLCMD; } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index fedbf67a0842..0a0a1ad1fe1f 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "node.h" @@ -48,6 +47,11 @@ static int gc_thread_func(void *data) continue; } +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -97,7 +101,7 @@ int start_gc_thread(struct f2fs_sb_info *sbi) dev_t dev = sbi->sb->s_bdev->bd_dev; int err = 0; - gc_th = kmalloc(sizeof(struct f2fs_gc_kthread), GFP_KERNEL); + gc_th = f2fs_kmalloc(sbi, sizeof(struct f2fs_gc_kthread), GFP_KERNEL); if (!gc_th) { err = -ENOMEM; goto out; @@ -173,9 +177,9 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, { /* SSR allocates in a segment unit */ if (p->alloc_mode == SSR) - return 1 << sbi->log_blocks_per_seg; + return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return (1 << sbi->log_blocks_per_seg) * p->ofs_unit; + return sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -246,6 +250,18 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, return get_cb_cost(sbi, segno); } +static unsigned int count_bits(const unsigned long *addr, + unsigned int offset, unsigned int len) +{ + unsigned int end = offset + len, sum = 0; + + while (offset < end) { + if (test_bit(offset++, addr)) + ++sum; + } + return sum; +} + /* * This function is called from two paths. * One is garbage collection and the other is SSR segment selection. @@ -259,9 +275,9 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct victim_sel_policy p; - unsigned int secno, max_cost; + unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); - int nsearched = 0; + unsigned int nsearched = 0; mutex_lock(&dirty_i->seglist_lock); @@ -269,11 +285,12 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, select_policy(sbi, gc_type, type, &p); p.min_segno = NULL_SEGNO; - p.min_cost = max_cost = get_max_cost(sbi, &p); + p.min_cost = get_max_cost(sbi, &p); if (p.max_search == 0) goto out; + last_victim = sbi->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -296,27 +313,35 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } p.offset = segno + p.ofs_unit; - if (p.ofs_unit > 1) + if (p.ofs_unit > 1) { p.offset -= segno % p.ofs_unit; + nsearched += count_bits(p.dirty_segmap, + p.offset - p.ofs_unit, + p.ofs_unit); + } else { + nsearched++; + } + secno = GET_SECNO(sbi, segno); if (sec_usage_check(sbi, secno)) - continue; + goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) - continue; + goto next; cost = get_gc_cost(sbi, segno, &p); if (p.min_cost > cost) { p.min_segno = segno; p.min_cost = cost; - } else if (unlikely(cost == max_cost)) { - continue; } - - if (nsearched++ >= p.max_search) { - sbi->last_victim[p.gc_mode] = segno; +next: + if (nsearched >= p.max_search) { + if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) + sbi->last_victim[p.gc_mode] = last_victim + 1; + else + sbi->last_victim[p.gc_mode] = segno + 1; break; } } @@ -400,13 +425,13 @@ static int check_valid_map(struct f2fs_sb_info *sbi, * On validity, copy that node with cold status, otherwise (invalid node) * ignore that. */ -static int gc_node_segment(struct f2fs_sb_info *sbi, +static void gc_node_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, unsigned int segno, int gc_type) { - bool initial = true; struct f2fs_summary *entry; block_t start_addr; int off; + int phase = 0; start_addr = START_BLOCK(sbi, segno); @@ -419,16 +444,24 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, struct node_info ni; /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; - if (initial) { + if (phase == 0) { + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { ra_node_page(sbi, nid); continue; } + + /* phase == 2 */ node_page = get_node_page(sbi, nid); if (IS_ERR(node_page)) continue; @@ -445,36 +478,12 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, continue; } - /* set page dirty and write it */ - if (gc_type == FG_GC) { - f2fs_wait_on_page_writeback(node_page, NODE); - set_page_dirty(node_page); - } else { - if (!PageWriteback(node_page)) - set_page_dirty(node_page); - } - f2fs_put_page(node_page, 1); + move_node_page(node_page, gc_type); stat_inc_node_blk_count(sbi, 1, gc_type); } - if (initial) { - initial = false; + if (++phase < 3) goto next_step; - } - - if (gc_type == FG_GC) { - struct writeback_control wbc = { - .sync_mode = WB_SYNC_ALL, - .nr_to_write = LONG_MAX, - .for_reclaim = 0, - }; - sync_node_pages(sbi, 0, &wbc); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } /* @@ -484,7 +493,7 @@ static int gc_node_segment(struct f2fs_sb_info *sbi, * as indirect or double indirect node blocks, are given, it must be a caller's * bug. */ -block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode) { unsigned int indirect_blks = 2 * NIDS_PER_BLOCK + 4; unsigned int bidx; @@ -501,7 +510,7 @@ block_t start_bidx_of_node(unsigned int node_ofs, struct f2fs_inode_info *fi) int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); bidx = node_ofs - 5 - dec; } - return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(fi); + return bidx * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode); } static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -547,6 +556,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) struct f2fs_summary sum; struct node_info ni; struct page *page; + block_t newaddr; int err; /* do not read out */ @@ -568,21 +578,24 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. */ - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); get_node_info(fio.sbi, dn.nid, &ni); set_summary(&sum, dn.nid, dn.ofs_in_node, ni.version); /* read page */ fio.page = page; - fio.blk_addr = dn.data_blkaddr; + fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), - fio.blk_addr, - FGP_LOCK|FGP_CREAT, - GFP_NOFS); - if (!fio.encrypted_page) - goto put_out; + allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, + &sum, CURSEG_COLD_DATA); + + fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, + FGP_LOCK | FGP_CREAT, GFP_NOFS); + if (!fio.encrypted_page) { + err = -ENOMEM; + goto recover_block; + } err = f2fs_submit_page_bio(&fio); if (err) @@ -591,33 +604,39 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) /* write page */ lock_page(fio.encrypted_page); - if (unlikely(!PageUptodate(fio.encrypted_page))) + if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) { + err = -EIO; goto put_page_out; - if (unlikely(fio.encrypted_page->mapping != META_MAPPING(fio.sbi))) + } + if (unlikely(!PageUptodate(fio.encrypted_page))) { + err = -EIO; goto put_page_out; + } set_page_dirty(fio.encrypted_page); - f2fs_wait_on_page_writeback(fio.encrypted_page, DATA); + f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true); if (clear_page_dirty_for_io(fio.encrypted_page)) dec_page_count(fio.sbi, F2FS_DIRTY_META); set_page_writeback(fio.encrypted_page); /* allocate block address */ - f2fs_wait_on_page_writeback(dn.node_page, NODE); - allocate_data_block(fio.sbi, NULL, fio.blk_addr, - &fio.blk_addr, &sum, CURSEG_COLD_DATA); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); + fio.rw = WRITE_SYNC; + fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); - dn.data_blkaddr = fio.blk_addr; - set_data_blkaddr(&dn); - f2fs_update_extent_cache(&dn); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); + f2fs_update_data_blkaddr(&dn, newaddr); + set_inode_flag(inode, FI_APPEND_WRITE); if (page->index == 0) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); put_page_out: f2fs_put_page(fio.encrypted_page, 1); +recover_block: + if (err) + __f2fs_replace_block(fio.sbi, &sum, newaddr, fio.old_blkaddr, + true, true); put_out: f2fs_put_dnode(&dn); out: @@ -645,12 +664,23 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) .page = page, .encrypted_page = NULL, }; + bool is_dirty = PageDirty(page); + int err; + +retry: set_page_dirty(page); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (clear_page_dirty_for_io(page)) inode_dec_dirty_pages(inode); + set_cold_data(page); - do_write_data_page(&fio); + + err = do_write_data_page(&fio); + if (err == -ENOMEM && is_dirty) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + clear_cold_data(page); } out: @@ -664,7 +694,7 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) * If the parent node is not valid or the data block address is different, * the victim data block is ignored. */ -static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, +static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct gc_inode_list *gc_list, unsigned int segno, int gc_type) { struct super_block *sb = sbi->sb; @@ -684,16 +714,23 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; block_t start_bidx; + nid_t nid = le32_to_cpu(entry->nid); /* stop BG_GC if there is not enough free sections. */ - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0)) - return 0; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) + return; if (check_valid_map(sbi, segno, off) == 0) continue; if (phase == 0) { - ra_node_page(sbi, le32_to_cpu(entry->nid)); + ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), 1, + META_NAT, true); + continue; + } + + if (phase == 1) { + ra_node_page(sbi, nid); continue; } @@ -701,14 +738,14 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (!is_alive(sbi, entry, &dni, start_addr + off, &nofs)) continue; - if (phase == 1) { + if (phase == 2) { ra_node_page(sbi, dni.ino); continue; } ofs_in_node = le16_to_cpu(entry->ofs_in_node); - if (phase == 2) { + if (phase == 3) { inode = f2fs_iget(sb, dni.ino); if (IS_ERR(inode) || is_bad_inode(inode)) continue; @@ -720,7 +757,7 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)); + start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, start_bidx + ofs_in_node, READA, true); if (IS_ERR(data_page)) { @@ -733,30 +770,41 @@ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } - /* phase 3 */ + /* phase 4 */ inode = find_gc_inode(gc_list, dni.ino); if (inode) { - start_bidx = start_bidx_of_node(nofs, F2FS_I(inode)) + struct f2fs_inode_info *fi = F2FS_I(inode); + bool locked = false; + + if (S_ISREG(inode->i_mode)) { + if (!down_write_trylock(&fi->dio_rwsem[READ])) + continue; + if (!down_write_trylock( + &fi->dio_rwsem[WRITE])) { + up_write(&fi->dio_rwsem[READ]); + continue; + } + locked = true; + } + + start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) move_encrypted_block(inode, start_bidx); else move_data_page(inode, start_bidx, gc_type); + + if (locked) { + up_write(&fi->dio_rwsem[WRITE]); + up_write(&fi->dio_rwsem[READ]); + } + stat_inc_data_blk_count(sbi, 1, gc_type); } } - if (++phase < 4) + if (++phase < 5) goto next_step; - - if (gc_type == FG_GC) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - - /* return 1 only if FG_GC succefully reclaimed one */ - if (get_valid_blocks(sbi, segno, 1) == 0) - return 1; - } - return 0; } static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, @@ -772,51 +820,84 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, return ret; } -static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int segno, +static int do_garbage_collect(struct f2fs_sb_info *sbi, + unsigned int start_segno, struct gc_inode_list *gc_list, int gc_type) { struct page *sum_page; struct f2fs_summary_block *sum; struct blk_plug plug; - int nfree = 0; + unsigned int segno = start_segno; + unsigned int end_segno = start_segno + sbi->segs_per_sec; + int sec_freed = 0; + unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + SUM_TYPE_DATA : SUM_TYPE_NODE; - /* read segment summary of victim */ - sum_page = get_sum_page(sbi, segno); + /* readahead multi ssa blocks those have contiguous address */ + if (sbi->segs_per_sec > 1) + ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), + sbi->segs_per_sec, META_SSA, true); + + /* reference all summary page */ + while (segno < end_segno) { + sum_page = get_sum_page(sbi, segno++); + unlock_page(sum_page); + } blk_start_plug(&plug); - sum = page_address(sum_page); + for (segno = start_segno; segno < end_segno; segno++) { - /* - * this is to avoid deadlock: - * - lock_page(sum_page) - f2fs_replace_block - * - check_valid_map() - mutex_lock(sentry_lock) - * - mutex_lock(sentry_lock) - change_curseg() - * - lock_page(sum_page) - */ - unlock_page(sum_page); + if (get_valid_blocks(sbi, segno, 1) == 0 || + unlikely(f2fs_cp_error(sbi))) + goto next; - switch (GET_SUM_TYPE((&sum->footer))) { - case SUM_TYPE_NODE: - nfree = gc_node_segment(sbi, sum->entries, segno, gc_type); - break; - case SUM_TYPE_DATA: - nfree = gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); - break; + /* find segment summary of victim */ + sum_page = find_get_page(META_MAPPING(sbi), + GET_SUM_BLOCK(sbi, segno)); + f2fs_bug_on(sbi, !PageUptodate(sum_page)); + f2fs_put_page(sum_page, 0); + + sum = page_address(sum_page); + f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); + + /* + * this is to avoid deadlock: + * - lock_page(sum_page) - f2fs_replace_block + * - check_valid_map() - mutex_lock(sentry_lock) + * - mutex_lock(sentry_lock) - change_curseg() + * - lock_page(sum_page) + */ + + if (type == SUM_TYPE_NODE) + gc_node_segment(sbi, sum->entries, segno, gc_type); + else + gc_data_segment(sbi, sum->entries, gc_list, segno, + gc_type); + + stat_inc_seg_count(sbi, type, gc_type); +next: + f2fs_put_page(sum_page, 0); } + + if (gc_type == FG_GC) + f2fs_submit_merged_bio(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + blk_finish_plug(&plug); - stat_inc_seg_count(sbi, GET_SUM_TYPE((&sum->footer)), gc_type); + if (gc_type == FG_GC && + get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) + sec_freed = 1; + stat_inc_call_count(sbi->stat_info); - f2fs_put_page(sum_page, 0); - return nfree; + return sec_freed; } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) { - unsigned int segno, i; + unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0; int ret = -EINVAL; @@ -832,46 +913,48 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) goto stop; - if (unlikely(f2fs_cp_error(sbi))) + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; goto stop; + } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed)) { + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { gc_type = FG_GC; - if (__get_victim(sbi, &segno, gc_type) || prefree_segments(sbi)) - write_checkpoint(sbi, &cpc); + /* + * If there is no victim and no prefree segment but still not + * enough free sections, we should flush dent/node blocks and do + * garbage collections. + */ + if (__get_victim(sbi, &segno, gc_type) || + prefree_segments(sbi)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + segno = NULL_SEGNO; + } else if (has_not_enough_free_secs(sbi, 0, 0)) { + ret = write_checkpoint(sbi, &cpc); + if (ret) + goto stop; + } } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) goto stop; ret = 0; - /* readahead multi ssa blocks those have contiguous address */ - if (sbi->segs_per_sec > 1) - ra_meta_pages(sbi, GET_SUM_BLOCK(sbi, segno), sbi->segs_per_sec, - META_SSA, true); - - for (i = 0; i < sbi->segs_per_sec; i++) { - /* - * for FG_GC case, halt gcing left segments once failed one - * of segments in selected section to avoid long latency. - */ - if (!do_garbage_collect(sbi, segno + i, &gc_list, gc_type) && - gc_type == FG_GC) - break; - } - - if (i == sbi->segs_per_sec && gc_type == FG_GC) + if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && + gc_type == FG_GC) sec_freed++; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) goto gc_more; if (gc_type == FG_GC) - write_checkpoint(sbi, &cpc); + ret = write_checkpoint(sbi, &cpc); } stop: mutex_unlock(&sbi->gc_mutex); diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index b4a65be9f7d3..a993967dcdb9 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -100,11 +100,3 @@ static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) return true; return false; } - -static inline int is_idle(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - struct request_list *rl = &q->root_rl; - return !(rl->count[BLK_RW_SYNC]) && !(rl->count[BLK_RW_ASYNC]); -} diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index b238d2fec3e5..71b7206c431e 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,8 +70,7 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, - struct f2fs_filename *fname) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -80,10 +79,6 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, const unsigned char *name = name_info->name; size_t len = name_info->len; - /* encrypted bigname case */ - if (fname && !fname->disk_name.name) - return cpu_to_le32(fname->hash); - if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index f35f3eb3541f..b150d03beec1 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -17,9 +17,6 @@ bool f2fs_may_inline_data(struct inode *inode) { - if (!test_opt(F2FS_I_SB(inode), INLINE_DATA)) - return false; - if (f2fs_is_atomic_file(inode)) return false; @@ -55,7 +52,7 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); /* Copy the whole inline data block */ src_addr = inline_data_addr(ipage); @@ -63,7 +60,8 @@ void read_inline_data(struct page *page, struct page *ipage) memcpy(dst_addr, src_addr, MAX_INLINE_DATA); flush_dcache_page(page); kunmap_atomic(dst_addr); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); } bool truncate_inline_inode(struct page *ipage, u64 from) @@ -75,9 +73,9 @@ bool truncate_inline_inode(struct page *ipage, u64 from) addr = inline_data_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memset(addr + from, 0, MAX_INLINE_DATA - from); - + set_page_dirty(ipage); return true; } @@ -112,11 +110,12 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) } if (page->index) - zero_user_segment(page, 0, PAGE_CACHE_SIZE); + zero_user_segment(page, 0, PAGE_SIZE); else read_inline_data(page, ipage); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); f2fs_put_page(ipage, 1); trace_android_fs_dataread_end(inode, page_offset(page), PAGE_SIZE); @@ -126,7 +125,6 @@ int f2fs_read_inline_data(struct inode *inode, struct page *page) int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) { - void *src_addr, *dst_addr; struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, @@ -136,8 +134,6 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) }; int dirty, err; - f2fs_bug_on(F2FS_I_SB(dn->inode), page->index); - if (!f2fs_exist_data(dn->inode)) goto clear_out; @@ -145,21 +141,9 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) if (err) return err; - f2fs_wait_on_page_writeback(page, DATA); + f2fs_bug_on(F2FS_P_SB(page), PageWriteback(page)); - if (PageUptodate(page)) - goto no_update; - - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); - - /* Copy the whole inline data block */ - src_addr = inline_data_addr(dn->inode_page); - dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - flush_dcache_page(page); - kunmap_atomic(dst_addr); - SetPageUptodate(page); -no_update: + read_inline_data(page, dn->inode_page); set_page_dirty(page); /* clear dirty state */ @@ -167,23 +151,21 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); - fio.blk_addr = dn->data_blkaddr; + fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); - f2fs_wait_on_page_writeback(page, DATA); + f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) inode_dec_dirty_pages(dn->inode); /* this converted inline_data should be recovered. */ - set_inode_flag(F2FS_I(dn->inode), FI_APPEND_WRITE); + set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ truncate_inline_inode(dn->inode_page, 0); + clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); f2fs_clear_inline_inode(dn->inode); - sync_inode_page(dn); f2fs_put_dnode(dn); return 0; } @@ -195,7 +177,10 @@ int f2fs_convert_inline_inode(struct inode *inode) struct page *ipage, *page; int err = 0; - page = grab_cache_page(inode->i_mapping, 0); + if (!f2fs_has_inline_data(inode)) + return 0; + + page = f2fs_grab_cache_page(inode->i_mapping, 0, false); if (!page) return -ENOMEM; @@ -217,6 +202,9 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_unlock_op(sbi); f2fs_put_page(page, 1); + + f2fs_balance_fs(sbi, dn.node_changed); + return err; } @@ -238,16 +226,17 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_bug_on(F2FS_I_SB(inode), page->index); - f2fs_wait_on_page_writeback(dn.inode_page, NODE); + f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); dst_addr = inline_data_addr(dn.inode_page); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); kunmap_atomic(src_addr); + set_page_dirty(dn.inode_page); - set_inode_flag(F2FS_I(inode), FI_APPEND_WRITE); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_APPEND_WRITE); + set_inode_flag(inode, FI_DATA_EXIST); - sync_inode_page(&dn); + clear_inline_node(dn.inode_page); f2fs_put_dnode(&dn); return 0; } @@ -276,16 +265,16 @@ bool recover_inline_data(struct inode *inode, struct page *npage) ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); src_addr = inline_data_addr(npage); dst_addr = inline_data_addr(ipage); memcpy(dst_addr, src_addr, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); + set_inode_flag(inode, FI_INLINE_DATA); + set_inode_flag(inode, FI_DATA_EXIST); - update_inode(inode, ipage); + set_page_dirty(ipage); f2fs_put_page(ipage, 1); return true; } @@ -296,7 +285,6 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (!truncate_inline_inode(ipage, 0)) return false; f2fs_clear_inline_inode(inode); - update_inode(inode, ipage); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -307,7 +295,7 @@ bool recover_inline_data(struct inode *inode, struct page *npage) } struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, - struct f2fs_filename *fname, struct page **res_page) + struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_inline_dentry *inline_dentry; @@ -318,10 +306,12 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + if (IS_ERR(ipage)) { + *res_page = ipage; return NULL; + } - namehash = f2fs_dentry_hash(&name, fname); + namehash = f2fs_dentry_hash(&name); inline_dentry = inline_data_addr(ipage); @@ -333,30 +323,6 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, else f2fs_put_page(ipage, 0); - /* - * For the most part, it should be a bug when name_len is zero. - * We stop here for figuring out where the bugs has occurred. - */ - f2fs_bug_on(sbi, d.max < 0); - return de; -} - -struct f2fs_dir_entry *f2fs_parent_inline_dir(struct inode *dir, - struct page **p) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; - struct f2fs_dir_entry *de; - struct f2fs_inline_dentry *dentry_blk; - - ipage = get_node_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return NULL; - - dentry_blk = inline_data_addr(ipage); - de = &dentry_blk->dentry[1]; - *p = ipage; - unlock_page(ipage); return de; } @@ -374,10 +340,8 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) { - i_size_write(inode, MAX_INLINE_DATA); - set_inode_flag(F2FS_I(inode), FI_UPDATE_DIR); - } + if (i_size_read(inode) < MAX_INLINE_DATA) + f2fs_i_size_write(inode, MAX_INLINE_DATA); return 0; } @@ -385,7 +349,7 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. */ -static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, struct f2fs_inline_dentry *inline_dentry) { struct page *page; @@ -393,7 +357,7 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, struct f2fs_dentry_block *dentry_blk; int err; - page = grab_cache_page(dir->i_mapping, 0); + page = f2fs_grab_cache_page(dir->i_mapping, 0, false); if (!page) { f2fs_put_page(ipage, 1); return -ENOMEM; @@ -404,8 +368,8 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, if (err) goto out; - f2fs_wait_on_page_writeback(page, DATA); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_CACHE_SIZE); + f2fs_wait_on_page_writeback(page, DATA, true); + zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); dentry_blk = kmap_atomic(page); @@ -426,37 +390,132 @@ static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, NR_INLINE_DENTRY * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); set_page_dirty(page); /* clear inline dir and flag after data writeback */ truncate_inline_inode(ipage, 0); stat_dec_inline_dir(dir); - clear_inode_flag(F2FS_I(dir), FI_INLINE_DENTRY); + clear_inode_flag(dir, FI_INLINE_DENTRY); - if (i_size_read(dir) < PAGE_CACHE_SIZE) { - i_size_write(dir, PAGE_CACHE_SIZE); - set_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } - - sync_inode_page(&dn); + f2fs_i_depth_write(dir, 1); + if (i_size_read(dir) < PAGE_SIZE) + f2fs_i_size_write(dir, PAGE_SIZE); out: f2fs_put_page(page, 1); return err; } -int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, - struct inode *inode, nid_t ino, umode_t mode) +static int f2fs_add_inline_entries(struct inode *dir, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_dentry_ptr d; + unsigned long bit_pos = 0; + int err = 0; + + make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + + while (bit_pos < d.max) { + struct f2fs_dir_entry *de; + struct qstr new_name; + nid_t ino; + umode_t fake_mode; + + if (!test_bit_le(bit_pos, d.bitmap)) { + bit_pos++; + continue; + } + + de = &d.dentry[bit_pos]; + + if (unlikely(!de->name_len)) { + bit_pos++; + continue; + } + + new_name.name = d.filename[bit_pos]; + new_name.len = de->name_len; + + ino = le32_to_cpu(de->ino); + fake_mode = get_de_type(de) << S_SHIFT; + + err = f2fs_add_regular_entry(dir, &new_name, NULL, NULL, + ino, fake_mode); + if (err) + goto punch_dentry_pages; + + bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); + } + return 0; +punch_dentry_pages: + truncate_inode_pages(&dir->i_data, 0); + truncate_blocks(dir, 0, false); + remove_dirty_inode(dir); + return err; +} + +static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + struct f2fs_inline_dentry *backup_dentry; + int err; + + backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), + sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + if (!backup_dentry) { + f2fs_put_page(ipage, 1); + return -ENOMEM; + } + + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); + truncate_inline_inode(ipage, 0); + + unlock_page(ipage); + + err = f2fs_add_inline_entries(dir, backup_dentry); + if (err) + goto recover; + + lock_page(ipage); + + stat_dec_inline_dir(dir); + clear_inode_flag(dir, FI_INLINE_DENTRY); + kfree(backup_dentry); + return 0; +recover: + lock_page(ipage); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + f2fs_i_depth_write(dir, 0); + f2fs_i_size_write(dir, MAX_INLINE_DATA); + set_page_dirty(ipage); + f2fs_put_page(ipage, 1); + + kfree(backup_dentry); + return err; +} + +static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, + struct f2fs_inline_dentry *inline_dentry) +{ + if (!F2FS_I(dir)->i_dir_level) + return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + else + return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); +} + +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - size_t namelen = name->len; struct f2fs_inline_dentry *dentry_blk = NULL; struct f2fs_dentry_ptr d; - int slots = GET_DENTRY_SLOTS(namelen); + int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; int err = 0; @@ -477,25 +536,27 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, if (inode) { down_write(&F2FS_I(inode)->i_sem); - page = init_inode_metadata(inode, dir, name, ipage); + page = init_inode_metadata(inode, dir, new_name, + orig_name, ipage); if (IS_ERR(page)) { err = PTR_ERR(page); goto fail; } + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); } - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(name, NULL); + name_hash = f2fs_dentry_hash(new_name); make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); - f2fs_update_dentry(ino, mode, &d, name, name_hash, bit_pos); + f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); /* we don't need to mark_inode_dirty now */ if (inode) { - F2FS_I(inode)->i_pino = dir->i_ino; - update_inode(inode, page); + f2fs_i_pino_write(inode, dir->i_ino); f2fs_put_page(page, 1); } @@ -503,11 +564,6 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *name, fail: if (inode) up_write(&F2FS_I(inode)->i_sem); - - if (is_inode_flag_set(F2FS_I(dir), FI_UPDATE_DIR)) { - update_inode(dir, ipage); - clear_inode_flag(F2FS_I(dir), FI_UPDATE_DIR); - } out: f2fs_put_page(ipage, 1); return err; @@ -522,22 +578,22 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, int i; lock_page(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); inline_dentry = inline_data_addr(page); bit_pos = dentry - inline_dentry->dentry; for (i = 0; i < slots; i++) - test_and_clear_bit_le(bit_pos + i, + __clear_bit_le(bit_pos + i, &inline_dentry->dentry_bitmap); set_page_dirty(page); + f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = CURRENT_TIME; + f2fs_mark_inode_dirty_sync(dir); if (inode) - f2fs_drop_nlink(dir, inode, page); - - f2fs_put_page(page, 1); + f2fs_drop_nlink(dir, inode); } bool f2fs_empty_inline_dir(struct inode *dir) @@ -565,7 +621,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) } int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, - struct f2fs_str *fstr) + struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); struct f2fs_inline_dentry *inline_dentry = NULL; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 97e20decacb4..d7369895a78a 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -11,6 +11,7 @@ #include #include #include +#include #include #include "f2fs.h" @@ -18,6 +19,13 @@ #include +void f2fs_mark_inode_dirty_sync(struct inode *inode) +{ + if (f2fs_inode_dirtied(inode)) + return; + mark_inode_dirty_sync(inode); +} + void f2fs_set_inode_flags(struct inode *inode) { unsigned int flags = F2FS_I(inode)->i_flags; @@ -35,6 +43,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); + f2fs_mark_inode_dirty_sync(inode); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) @@ -83,10 +92,10 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); - set_inode_flag(F2FS_I(inode), FI_DATA_EXIST); - set_raw_inline(F2FS_I(inode), F2FS_INODE(ipage)); + set_inode_flag(inode, FI_DATA_EXIST); + set_raw_inline(inode, F2FS_INODE(ipage)); set_page_dirty(ipage); return; } @@ -138,9 +147,10 @@ static int do_read_inode(struct inode *inode) fi->i_pino = le32_to_cpu(ri->i_pino); fi->i_dir_level = ri->i_dir_level; - f2fs_init_extent_tree(inode, &ri->i_ext); + if (f2fs_init_extent_tree(inode, &ri->i_ext)) + set_page_dirty(node_page); - get_inline_info(fi, ri); + get_inline_info(inode, ri); /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) @@ -150,7 +160,10 @@ static int do_read_inode(struct inode *inode) __get_inode_rdev(inode, ri); if (__written_first_block(ri)) - set_inode_flag(F2FS_I(inode), FI_FIRST_BLOCK_WRITTEN); + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + + if (!need_inode_block_update(sbi, inode->i_ino)) + fi->last_disk_size = inode->i_size; f2fs_put_page(node_page, 1); @@ -202,6 +215,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -221,11 +235,27 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) return ERR_PTR(ret); } -void update_inode(struct inode *inode, struct page *node_page) +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) +{ + struct inode *inode; +retry: + inode = f2fs_iget(sb, ino); + if (IS_ERR(inode)) { + if (PTR_ERR(inode) == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } + } + return inode; +} + +int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; - f2fs_wait_on_page_writeback(node_page, NODE); + f2fs_inode_synced(inode); + + f2fs_wait_on_page_writeback(node_page, NODE, true); ri = F2FS_INODE(node_page); @@ -242,7 +272,7 @@ void update_inode(struct inode *inode, struct page *node_page) &ri->i_ext); else memset(&ri->i_ext, 0, sizeof(ri->i_ext)); - set_raw_inline(F2FS_I(inode), ri); + set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); ri->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec); @@ -259,15 +289,19 @@ void update_inode(struct inode *inode, struct page *node_page) __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); - set_page_dirty(node_page); - clear_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + /* deleted inode */ + if (inode->i_nlink == 0) + clear_inline_node(node_page); + + return set_page_dirty(node_page); } -void update_inode_page(struct inode *inode) +int update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *node_page; + int ret = 0; retry: node_page = get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { @@ -276,12 +310,14 @@ void update_inode_page(struct inode *inode) cond_resched(); goto retry; } else if (err != -ENOENT) { - f2fs_stop_checkpoint(sbi); + f2fs_stop_checkpoint(sbi, false); } - return; + f2fs_inode_synced(inode); + return 0; } - update_inode(inode, node_page); + ret = update_inode(inode, node_page); f2fs_put_page(node_page, 1); + return ret; } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) @@ -292,16 +328,15 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) inode->i_ino == F2FS_META_INO(sbi)) return 0; - if (!is_inode_flag_set(F2FS_I(inode), FI_DIRTY_INODE)) + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; /* * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - update_inode_page(inode); - - f2fs_balance_fs(sbi); + if (update_inode_page(inode)) + f2fs_balance_fs(sbi, true); return 0; } @@ -311,13 +346,12 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) void f2fs_evict_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct f2fs_inode_info *fi = F2FS_I(inode); - nid_t xnid = fi->i_xattr_nid; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; int err = 0; /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); trace_f2fs_evict_inode(inode); truncate_inode_pages_final(&inode->i_data); @@ -327,19 +361,24 @@ void f2fs_evict_inode(struct inode *inode) goto out_clear; f2fs_bug_on(sbi, get_dirty_pages(inode)); - remove_dirty_dir_inode(inode); + remove_dirty_inode(inode); f2fs_destroy_extent_tree(inode); if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; - sb_start_intwrite(inode->i_sb); - set_inode_flag(fi, FI_NO_ALLOC); - i_size_write(inode, 0); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) + goto no_delete; +#endif + sb_start_intwrite(inode->i_sb); + set_inode_flag(inode, FI_NO_ALLOC); + i_size_write(inode, 0); +retry: if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, true); + err = f2fs_truncate(inode); if (!err) { f2fs_lock_op(sbi); @@ -347,6 +386,14 @@ void f2fs_evict_inode(struct inode *inode) f2fs_unlock_op(sbi); } + /* give more chances, if ENOMEM case */ + if (err == -ENOMEM) { + err = 0; + goto retry; + } + + if (err) + update_inode_page(inode); sb_end_intwrite(inode->i_sb); no_delete: stat_dec_inline_xattr(inode); @@ -356,36 +403,18 @@ void f2fs_evict_inode(struct inode *inode) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(fi, FI_APPEND_WRITE)) - add_dirty_inode(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(fi, FI_UPDATE_WRITE)) - add_dirty_inode(sbi, inode->i_ino, UPDATE_INO); - if (is_inode_flag_set(fi, FI_FREE_NID)) { - if (err && err != -ENOENT) - alloc_nid_done(sbi, inode->i_ino); - else - alloc_nid_failed(sbi, inode->i_ino); - clear_inode_flag(fi, FI_FREE_NID); - } - - if (err && err != -ENOENT) { - if (!exist_written_data(sbi, inode->i_ino, ORPHAN_INO)) { - /* - * get here because we failed to release resource - * of inode previously, reminder our user to run fsck - * for fixing. - */ - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_msg(sbi->sb, KERN_WARNING, - "inode (ino:%lu) resource leak, run fsck " - "to fix this issue!", inode->i_ino); - } + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (is_inode_flag_set(inode, FI_FREE_NID)) { + alloc_nid_failed(sbi, inode->i_ino); + clear_inode_flag(inode, FI_FREE_NID); } + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (fi->i_crypt_info) - f2fs_free_encryption_info(inode, fi->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); } @@ -393,37 +422,32 @@ void f2fs_evict_inode(struct inode *inode) void handle_failed_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - int err = 0; + struct node_info ni; - clear_nlink(inode); - make_bad_inode(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); - i_size_write(inode, 0); - if (F2FS_HAS_BLOCKS(inode)) - err = f2fs_truncate(inode, false); - - if (!err) - err = remove_inode_page(inode); - /* - * if we skip truncate_node in remove_inode_page bacause we failed - * before, it's better to find another way to release resource of - * this inode (e.g. valid block count, node block or nid). Here we - * choose to add this inode to orphan list, so that we can call iput - * for releasing in orphan recovery flow. - * * Note: we should add inode to orphan list before f2fs_unlock_op() * so we can prevent losing this orphan when encoutering checkpoint * and following suddenly power-off. */ - if (err && err != -ENOENT) { - err = acquire_orphan_inode(sbi); - if (!err) - add_orphan_inode(sbi, inode->i_ino); + get_node_info(sbi, inode->i_ino, &ni); + + if (ni.blk_addr != NULL_ADDR) { + int err = acquire_orphan_inode(sbi); + if (err) { + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_msg(sbi->sb, KERN_WARNING, + "Too many orphan inodes, run fsck to fix."); + } else { + add_orphan_inode(inode); + } + alloc_nid_done(sbi, inode->i_ino); + } else { + set_inode_flag(inode, FI_FREE_NID); } - set_inode_flag(F2FS_I(inode), FI_FREE_NID); f2fs_unlock_op(sbi); /* iput will drop the inode object */ diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 2c32110f9fc0..0f071a70522d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -60,10 +60,14 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); - if (f2fs_may_inline_data(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DATA); + set_inode_flag(inode, FI_NEW_INODE); + + if (test_opt(sbi, INLINE_XATTR)) + set_inode_flag(inode, FI_INLINE_XATTR); + if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) + set_inode_flag(inode, FI_INLINE_DATA); if (f2fs_may_inline_dentry(inode)) - set_inode_flag(F2FS_I(inode), FI_INLINE_DENTRY); + set_inode_flag(inode, FI_INLINE_DENTRY); f2fs_init_extent_tree(inode, NULL); @@ -72,14 +76,13 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_dir(inode); trace_f2fs_new_inode(inode, 0); - mark_inode_dirty(inode); return inode; fail: trace_f2fs_new_inode(inode, err); make_bad_inode(inode); if (nid_free) - set_inode_flag(F2FS_I(inode), FI_FREE_NID); + set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); } @@ -88,18 +91,23 @@ static int is_multimedia_file(const unsigned char *s, const char *sub) { size_t slen = strlen(s); size_t sublen = strlen(sub); + int i; /* * filename format of multimedia file should be defined as: - * "filename + '.' + extension". + * "filename + '.' + extension + (optional: '.' + temp extension)". */ if (slen < sublen + 2) return 0; - if (s[slen - sublen - 1] != '.') - return 0; + for (i = 1; i < slen - sublen; i++) { + if (s[i] != '.') + continue; + if (!strncasecmp(s + i + 1, sub, sublen)) + return 1; + } - return !strncasecmp(s + slen - sublen, sub, sublen); + return 0; } /* @@ -128,8 +136,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -142,6 +148,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -169,15 +177,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, int err; if (f2fs_encrypted_inode(dir) && - !f2fs_is_child_context_consistent_with_parent(dir, inode)) + !fscrypt_has_permitted_context(dir, inode)) return -EPERM; - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); inode->i_ctime = CURRENT_TIME; ihold(inode); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -190,7 +198,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); return 0; out: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); iput(inode); f2fs_unlock_op(sbi); return err; @@ -199,10 +207,14 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *f2fs_get_parent(struct dentry *child) { struct qstr dotdot = QSTR_INIT("..", 2); - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot); - if (!ino) + struct page *page; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + if (!ino) { + if (IS_ERR(page)) + return ERR_CAST(page); return ERR_PTR(-ENOENT); - return d_obtain_alias(f2fs_iget(d_inode(child)->i_sb, ino)); + } + return d_obtain_alias(f2fs_iget(child->d_sb, ino)); } static int __recover_dot_dentries(struct inode *dir, nid_t pino) @@ -214,12 +226,24 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) struct page *page; int err = 0; + if (f2fs_readonly(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_INFO, + "skip recovering inline_dots inode (ino:%lu, pino:%u) " + "in readonly mountpoint", dir->i_ino, pino); + return 0; + } + + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); de = f2fs_find_entry(dir, &dot, &page); if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; } else { err = __f2fs_add_link(dir, &dot, NULL, dir->i_ino, S_IFDIR); if (err) @@ -230,14 +254,14 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) if (de) { f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { err = __f2fs_add_link(dir, &dotdot, NULL, pino, S_IFDIR); } out: - if (!err) { - clear_inode_flag(F2FS_I(dir), FI_INLINE_DOTS); - mark_inode_dirty(dir); - } + if (!err) + clear_inode_flag(dir, FI_INLINE_DOTS); f2fs_unlock_op(sbi); return err; @@ -251,13 +275,32 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, struct page *page; nid_t ino; int err = 0; + unsigned int root_ino = F2FS_ROOT_INO(F2FS_I_SB(dir)); + + if (f2fs_encrypted_inode(dir)) { + int res = fscrypt_get_encryption_info(dir); + + /* + * DCACHE_ENCRYPTED_WITH_KEY is set if the dentry is + * created while the directory was encrypted and we + * don't have access to the key. + */ + if (fscrypt_has_encryption_key(dir)) + fscrypt_set_encrypted_dentry(dentry); + fscrypt_set_d_op(dentry); + if (res && res != -ENOKEY) + return ERR_PTR(res); + } if (dentry->d_name.len > F2FS_NAME_LEN) return ERR_PTR(-ENAMETOOLONG); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + return (struct dentry *)page; return d_splice_alias(inode, dentry); + } ino = le32_to_cpu(de->ino); f2fs_dentry_kunmap(dir, page); @@ -267,15 +310,29 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (IS_ERR(inode)) return ERR_CAST(inode); + if ((dir->i_ino == root_ino) && f2fs_has_inline_dots(dir)) { + err = __recover_dot_dentries(dir, root_ino); + if (err) + goto err_out; + } + if (f2fs_has_inline_dots(inode)) { err = __recover_dot_dentries(inode, dir->i_ino); if (err) goto err_out; } + if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { + bool nokey = f2fs_encrypted_inode(inode) && + !fscrypt_has_encryption_key(inode); + err = nokey ? -ENOKEY : -EPERM; + goto err_out; + } return d_splice_alias(inode, dentry); err_out: - iget_failed(inode); + iput(inode); return ERR_PTR(err); } @@ -288,11 +345,15 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) int err = -ENOENT; trace_f2fs_unlink_enter(dir, dentry); - f2fs_balance_fs(sbi); de = f2fs_find_entry(dir, &dentry->d_name, &page); - if (!de) + if (!de) { + if (IS_ERR(page)) + err = PTR_ERR(page); goto fail; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); @@ -305,9 +366,6 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) f2fs_delete_entry(de, page, dir, inode); f2fs_unlock_op(sbi); - /* In order to evict this inode, we set it dirty */ - mark_inode_dirty(inode); - if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); fail: @@ -332,16 +390,24 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; size_t len = strlen(symname); - size_t p_len; - char *p_str; - struct f2fs_str disk_link = FSTR_INIT(NULL, 0); - struct f2fs_encrypted_symlink_data *sd = NULL; + struct fscrypt_str disk_link = FSTR_INIT((char *)symname, len + 1); + struct fscrypt_symlink_data *sd = NULL; int err; - if (len > dir->i_sb->s_blocksize) - return -ENAMETOOLONG; + if (f2fs_encrypted_inode(dir)) { + err = fscrypt_get_encryption_info(dir); + if (err) + return err; - f2fs_balance_fs(sbi); + if (!fscrypt_has_encryption_key(dir)) + return -EPERM; + + disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + + sizeof(struct fscrypt_symlink_data)); + } + + if (disk_link.len > dir->i_sb->s_blocksize) + return -ENAMETOOLONG; inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) @@ -351,8 +417,11 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode->i_op = &f2fs_encrypted_symlink_inode_operations; else inode->i_op = &f2fs_symlink_inode_operations; + inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -360,42 +429,36 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, f2fs_unlock_op(sbi); alloc_nid_done(sbi, inode->i_ino); - if (f2fs_encrypted_inode(dir)) { + if (f2fs_encrypted_inode(inode)) { struct qstr istr = QSTR_INIT(symname, len); + struct fscrypt_str ostr; - err = f2fs_get_encryption_info(inode); - if (err) - goto err_out; - - err = f2fs_fname_crypto_alloc_buffer(inode, len, &disk_link); - if (err) - goto err_out; - - err = f2fs_fname_usr_to_disk(inode, &istr, &disk_link); - if (err < 0) - goto err_out; - - p_len = encrypted_symlink_data_len(disk_link.len) + 1; - - if (p_len > dir->i_sb->s_blocksize) { - err = -ENAMETOOLONG; - goto err_out; - } - - sd = kzalloc(p_len, GFP_NOFS); + sd = kzalloc(disk_link.len, GFP_NOFS); if (!sd) { err = -ENOMEM; goto err_out; } - memcpy(sd->encrypted_path, disk_link.name, disk_link.len); - sd->len = cpu_to_le16(disk_link.len); - p_str = (char *)sd; - } else { - p_len = len + 1; - p_str = (char *)symname; + + err = fscrypt_get_encryption_info(inode); + if (err) + goto err_out; + + if (!fscrypt_has_encryption_key(inode)) { + err = -EPERM; + goto err_out; + } + + ostr.name = sd->encrypted_path; + ostr.len = disk_link.len; + err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); + if (err < 0) + goto err_out; + + sd->len = cpu_to_le16(ostr.len); + disk_link.name = (char *)sd; } - err = page_symlink(inode, p_str, p_len); + err = page_symlink(inode, disk_link.name, disk_link.len); err_out: d_instantiate(dentry, inode); @@ -411,7 +474,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, * performance regression. */ if (!err) { - filemap_write_and_wait_range(inode->i_mapping, 0, p_len - 1); + filemap_write_and_wait_range(inode->i_mapping, 0, + disk_link.len - 1); if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); @@ -420,7 +484,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, } kfree(sd); - f2fs_fname_crypto_free_buffer(&disk_link); return err; out: handle_failed_inode(inode); @@ -433,8 +496,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -444,7 +505,9 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - set_inode_flag(F2FS_I(inode), FI_INC_LINK); + f2fs_balance_fs(sbi, true); + + set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -461,7 +524,7 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) return 0; out_fail: - clear_inode_flag(F2FS_I(inode), FI_INC_LINK); + clear_inode_flag(inode, FI_INC_LINK); handle_failed_inode(inode); return err; } @@ -481,8 +544,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -490,6 +551,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -516,9 +579,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; - if (!whiteout) - f2fs_balance_fs(sbi); - inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -532,6 +592,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -545,17 +607,17 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, * add this non-linked tmpfile to orphan list, in this way we could * remove all unused data of tmpfile after abnormal power-off. */ - add_orphan_inode(sbi, inode->i_ino); - f2fs_unlock_op(sbi); - + add_orphan_inode(inode); alloc_nid_done(sbi, inode->i_ino); if (whiteout) { - inode_dec_link_count(inode); + f2fs_i_links_write(inode, false); *whiteout = inode; } else { d_tmpfile(dentry, inode); } + /* link_count was changed by d_tmpfile as well. */ + f2fs_unlock_op(sbi); unlock_new_inode(inode); return 0; @@ -569,7 +631,7 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) { if (f2fs_encrypted_inode(dir)) { - int err = f2fs_get_encryption_info(dir); + int err = fscrypt_get_encryption_info(dir); if (err) return err; } @@ -595,26 +657,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; + bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && - !f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode)) { + !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; goto out; } - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_old; + } } if (flags & RENAME_WHITEOUT) { @@ -632,8 +697,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_whiteout; + } + + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -641,8 +711,9 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - if (update_dent_inode(old_inode, new_inode, - &new_dentry->d_name)) { + err = update_dent_inode(old_inode, new_inode, + &new_dentry->d_name); + if (err) { release_orphan_inode(sbi); goto put_out_dir; } @@ -652,20 +723,17 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, new_inode->i_ctime = CURRENT_TIME; down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) - drop_nlink(new_inode); - drop_nlink(new_inode); + f2fs_i_links_write(new_inode, false); + f2fs_i_links_write(new_inode, false); up_write(&F2FS_I(new_inode)->i_sem); - mark_inode_dirty(new_inode); - if (!new_inode->i_nlink) - add_orphan_inode(sbi, new_inode->i_ino); + add_orphan_inode(new_inode); else release_orphan_inode(sbi); - - update_inode_page(old_inode); - update_inode_page(new_inode); } else { + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = f2fs_add_link(new_dentry, old_inode); @@ -674,9 +742,29 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_whiteout; } - if (old_dir_entry) { - inc_nlink(new_dir); - update_inode_page(new_dir); + if (old_dir_entry) + f2fs_i_links_write(new_dir, true); + + /* + * old entry and new entry can locate in the same inline + * dentry in inode, when attaching new entry in inline dentry, + * it could force inline dentry conversion, after that, + * old_entry and old_page will point to wrong address, in + * order to avoid this, let's do the check and update here. + */ + if (is_old_inline && !f2fs_has_inline_dentry(old_dir)) { + f2fs_put_page(old_page, 0); + old_page = NULL; + + old_entry = f2fs_find_entry(old_dir, + &old_dentry->d_name, &old_page); + if (!old_entry) { + err = -ENOENT; + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); + f2fs_unlock_op(sbi); + goto out_whiteout; + } } } @@ -687,13 +775,13 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = CURRENT_TIME; - mark_inode_dirty(old_inode); + f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); if (whiteout) { whiteout->i_state |= I_LINKABLE; - set_inode_flag(F2FS_I(whiteout), FI_INC_LINK); + set_inode_flag(whiteout, FI_INC_LINK); err = f2fs_add_link(old_dentry, whiteout); if (err) goto put_out_dir; @@ -705,14 +793,11 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir && !whiteout) { f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); - update_inode_page(old_inode); } else { f2fs_dentry_kunmap(old_inode, old_dir_page); f2fs_put_page(old_dir_page, 0); } - drop_nlink(old_dir); - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_i_links_write(old_dir, false); } f2fs_unlock_op(sbi); @@ -756,39 +841,45 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int err = -ENOENT; if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && - (old_dir != new_dir) && - (!f2fs_is_child_context_consistent_with_parent(new_dir, - old_inode) || - !f2fs_is_child_context_consistent_with_parent(old_dir, - new_inode))) + (old_dir != new_dir) && + (!fscrypt_has_permitted_context(new_dir, old_inode) || + !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; - f2fs_balance_fs(sbi); - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); - if (!old_entry) + if (!old_entry) { + if (IS_ERR(old_page)) + err = PTR_ERR(old_page); goto out; + } new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); - if (!new_entry) + if (!new_entry) { + if (IS_ERR(new_page)) + err = PTR_ERR(new_page); goto out_old; + } /* prepare for updating ".." directory entry info later */ if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); - if (!old_dir_entry) + if (!old_dir_entry) { + if (IS_ERR(old_dir_page)) + err = PTR_ERR(old_dir_page); goto out_new; + } } if (S_ISDIR(new_inode->i_mode)) { - err = -EIO; new_dir_entry = f2fs_parent_dir(new_inode, &new_dir_page); - if (!new_dir_entry) + if (!new_dir_entry) { + if (IS_ERR(new_dir_page)) + err = PTR_ERR(new_dir_page); goto out_old_dir; + } } } @@ -807,6 +898,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_new_dir; } + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); @@ -836,19 +929,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - update_inode_page(old_inode); - old_dir->i_ctime = CURRENT_TIME; if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); - if (old_nlink < 0) - drop_nlink(old_dir); - else - inc_nlink(old_dir); + f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - mark_inode_dirty(old_dir); - update_inode_page(old_dir); + f2fs_mark_inode_dirty_sync(old_dir); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -857,19 +944,13 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - update_inode_page(new_inode); - new_dir->i_ctime = CURRENT_TIME; if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); - if (new_nlink < 0) - drop_nlink(new_dir); - else - inc_nlink(new_dir); + f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - mark_inode_dirty(new_dir); - update_inode_page(new_dir); + f2fs_mark_inode_dirty_sync(new_dir); f2fs_unlock_op(sbi); @@ -922,89 +1003,85 @@ static int f2fs_rename2(struct inode *old_dir, struct dentry *old_dentry, return f2fs_rename(old_dir, old_dentry, new_dir, new_dentry, flags); } -#ifdef CONFIG_F2FS_FS_ENCRYPTION static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cookie) { struct page *cpage = NULL; char *caddr, *paddr = NULL; - struct f2fs_str cstr; - struct f2fs_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_str cstr = FSTR_INIT(NULL, 0); + struct fscrypt_str pstr = FSTR_INIT(NULL, 0); + struct fscrypt_symlink_data *sd; struct inode *inode = d_inode(dentry); - struct f2fs_encrypted_symlink_data *sd; - loff_t size = min_t(loff_t, i_size_read(inode), PAGE_SIZE - 1); u32 max_size = inode->i_sb->s_blocksize; int res; - res = f2fs_get_encryption_info(inode); + if (!dentry) + return ERR_PTR(-ECHILD); + + res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); cpage = read_mapping_page(inode->i_mapping, 0, NULL); if (IS_ERR(cpage)) return ERR_CAST(cpage); - caddr = kmap(cpage); - caddr[size] = 0; + caddr = page_address(cpage); /* Symlink is encrypted */ - sd = (struct f2fs_encrypted_symlink_data *)caddr; + sd = (struct fscrypt_symlink_data *)caddr; + cstr.name = sd->encrypted_path; cstr.len = le16_to_cpu(sd->len); - cstr.name = kmalloc(cstr.len, GFP_NOFS); - if (!cstr.name) { - res = -ENOMEM; - goto errout; - } - memcpy(cstr.name, sd->encrypted_path, cstr.len); /* this is broken symlink case */ - if (cstr.name[0] == 0 && cstr.len == 0) { + if (unlikely(cstr.len == 0)) { res = -ENOENT; goto errout; } - if ((cstr.len + sizeof(struct f2fs_encrypted_symlink_data) - 1) > - max_size) { + if ((cstr.len + sizeof(struct fscrypt_symlink_data) - 1) > max_size) { /* Symlink data on the disk is corrupted */ res = -EIO; goto errout; } - res = f2fs_fname_crypto_alloc_buffer(inode, cstr.len, &pstr); + res = fscrypt_fname_alloc_buffer(inode, cstr.len, &pstr); if (res) goto errout; - res = f2fs_fname_disk_to_usr(inode, NULL, &cstr, &pstr); + res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); if (res < 0) goto errout; - kfree(cstr.name); + /* this is broken symlink case */ + if (unlikely(pstr.name[0] == 0)) { + res = -ENOENT; + goto errout; + } paddr = pstr.name; /* Null-terminate the name */ paddr[res] = '\0'; - kunmap(cpage); - page_cache_release(cpage); + put_page(cpage); return *cookie = paddr; errout: - kfree(cstr.name); - f2fs_fname_crypto_free_buffer(&pstr); - kunmap(cpage); - page_cache_release(cpage); + fscrypt_fname_free_buffer(&pstr); + put_page(cpage); return ERR_PTR(res); } const struct inode_operations f2fs_encrypted_symlink_inode_operations = { .readlink = generic_readlink, - .follow_link = f2fs_encrypted_follow_link, - .put_link = kfree_put_link, + .follow_link = f2fs_encrypted_follow_link, + .put_link = kfree_put_link, .getattr = f2fs_getattr, .setattr = f2fs_setattr, +#ifdef CONFIG_F2FS_FS_XATTR .setxattr = generic_setxattr, .getxattr = generic_getxattr, .listxattr = f2fs_listxattr, .removexattr = generic_removexattr, -}; #endif +}; const struct inode_operations f2fs_dir_inode_operations = { .create = f2fs_create, diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 7bcbc6e9c40d..b1e615ed2bef 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -46,12 +46,14 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) */ if (type == FREE_NIDS) { mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> - PAGE_CACHE_SHIFT; + PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); + if (excess_cached_nats(sbi)) + res = false; } else if (type == DIRTY_DENTS) { if (sbi->sb->s_bdi->wb.dirty_exceeded) return false; @@ -62,16 +64,17 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) for (i = 0; i <= UPDATE_INO; i++) mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_CACHE_SHIFT; + sizeof(struct ino_entry)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { - mem_size = (sbi->total_ext_tree * sizeof(struct extent_tree) + + mem_size = (atomic_read(&sbi->total_ext_tree) * + sizeof(struct extent_tree) + atomic_read(&sbi->total_ext_node) * - sizeof(struct extent_node)) >> PAGE_CACHE_SHIFT; + sizeof(struct extent_node)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else { - if (sbi->sb->s_bdi->wb.dirty_exceeded) - return false; + if (!sbi->sb->s_bdi->wb.dirty_exceeded) + return true; } return res; } @@ -120,7 +123,7 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -256,18 +259,21 @@ static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) return new; } -static void cache_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, +static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_nat_entry *ne) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct nat_entry *e; - down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, nid); if (!e) { e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); + } else { + f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || + nat_get_blkaddr(e) != ne->block_addr || + nat_get_version(e) != ne->version); } - up_write(&nm_i->nat_tree_lock); } static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, @@ -355,7 +361,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = START_NID(nid); struct f2fs_nat_block *nat_blk; struct page *page = NULL; @@ -372,21 +378,20 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) ni->ino = nat_get_ino(e); ni->blk_addr = nat_get_blkaddr(e); ni->version = nat_get_version(e); - } - up_read(&nm_i->nat_tree_lock); - if (e) + up_read(&nm_i->nat_tree_lock); return; + } memset(&ne, 0, sizeof(struct f2fs_nat_entry)); /* Check current segment summary */ - mutex_lock(&curseg->curseg_mutex); - i = lookup_journal_in_cursum(sum, NAT_JOURNAL, nid, 0); + down_read(&curseg->journal_rwsem); + i = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 0); if (i >= 0) { - ne = nat_in_journal(sum, i); + ne = nat_in_journal(journal, i); node_info_from_raw_nat(ni, &ne); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); if (i >= 0) goto cache; @@ -397,18 +402,75 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: + up_read(&nm_i->nat_tree_lock); /* cache nat entry */ - cache_nat_entry(NM_I(sbi), nid, &ne); + down_write(&nm_i->nat_tree_lock); + cache_nat_entry(sbi, nid, &ne); + up_write(&nm_i->nat_tree_lock); +} + +/* + * readahead MAX_RA_NODE number of node pages. + */ +static void ra_node_pages(struct page *parent, int start, int n) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct blk_plug plug; + int i, end; + nid_t nid; + + blk_start_plug(&plug); + + /* Then, try readahead for siblings of the desired node */ + end = start + n; + end = min(end, NIDS_PER_BLOCK); + for (i = start; i < end; i++) { + nid = get_nid(parent, i, false); + ra_node_page(sbi, nid); + } + + blk_finish_plug(&plug); +} + +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs) +{ + const long direct_index = ADDRS_PER_INODE(dn->inode); + const long direct_blks = ADDRS_PER_BLOCK; + const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; + unsigned int skipped_unit = ADDRS_PER_BLOCK; + int cur_level = dn->cur_level; + int max_level = dn->max_level; + pgoff_t base = 0; + + if (!dn->max_level) + return pgofs + 1; + + while (max_level-- > cur_level) + skipped_unit *= NIDS_PER_BLOCK; + + switch (dn->max_level) { + case 3: + base += 2 * indirect_blks; + case 2: + base += 2 * direct_blks; + case 1: + base += direct_index; + break; + default: + f2fs_bug_on(F2FS_I_SB(dn->inode), 1); + } + + return ((pgofs - base) / skipped_unit + 1) * skipped_unit + base; } /* * The maximum depth is four. * Offset[0] will have raw inode offset. */ -static int get_node_path(struct f2fs_inode_info *fi, long block, +static int get_node_path(struct inode *inode, long block, int offset[4], unsigned int noffset[4]) { - const long direct_index = ADDRS_PER_INODE(fi); + const long direct_index = ADDRS_PER_INODE(inode); const long direct_blks = ADDRS_PER_BLOCK; const long dptrs_per_blk = NIDS_PER_BLOCK; const long indirect_blks = ADDRS_PER_BLOCK * NIDS_PER_BLOCK; @@ -493,10 +555,10 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int offset[4]; unsigned int noffset[4]; nid_t nids[4]; - int level, i; + int level, i = 0; int err = 0; - level = get_node_path(F2FS_I(dn->inode), index, offset, noffset); + level = get_node_path(dn->inode, index, offset, noffset); nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -583,6 +645,11 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) release_out: dn->inode_page = NULL; dn->node_page = NULL; + if (err == -ENOENT) { + dn->cur_level = i; + dn->max_level = level; + dn->ofs_in_node = offset[level]; + } return err; } @@ -606,8 +673,7 @@ static void truncate_node(struct dnode_of_data *dn) if (dn->nid == dn->inode->i_ino) { remove_orphan_inode(sbi, dn->nid); dec_valid_inode_count(sbi); - } else { - sync_inode_page(dn); + f2fs_inode_synced(dn->inode); } invalidate: clear_node_page_dirty(dn->node_page); @@ -666,6 +732,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, return PTR_ERR(page); } + ra_node_pages(page, ofs, NIDS_PER_BLOCK); + rn = F2FS_NODE(page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { @@ -676,7 +744,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; } } else { child_nofs = nofs + ofs * (NIDS_PER_BLOCK + 1) + 1; @@ -689,7 +758,8 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - set_nid(page, i, 0, false); + if (set_nid(page, i, 0, false)) + dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { goto out_err; @@ -741,6 +811,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, nid[i + 1] = get_nid(pages[i], offset[i + 1], false); } + ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { child_nid = get_nid(pages[idx], i, false); @@ -750,7 +822,8 @@ static int truncate_partial_nodes(struct dnode_of_data *dn, err = truncate_dnode(dn); if (err < 0) goto fail; - set_nid(pages[idx], i, 0, false); + if (set_nid(pages[idx], i, 0, false)) + dn->node_changed = true; } if (offset[idx + 1] == 0) { @@ -787,8 +860,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); - level = get_node_path(F2FS_I(inode), from, offset, noffset); -restart: + level = get_node_path(inode, from, offset, noffset); + page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); @@ -852,11 +925,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) if (offset[1] == 0 && ri->i_nid[offset[0] - NODE_DIR1_BLOCK]) { lock_page(page); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto restart; - } - f2fs_wait_on_page_writeback(page, NODE); + BUG_ON(page->mapping != NODE_MAPPING(sbi)); + f2fs_wait_on_page_writeback(page, NODE, true); ri->i_nid[offset[0] - NODE_DIR1_BLOCK] = 0; set_page_dirty(page); unlock_page(page); @@ -885,7 +955,7 @@ int truncate_xattr_node(struct inode *inode, struct page *page) if (IS_ERR(npage)) return PTR_ERR(npage); - F2FS_I(inode)->i_xattr_nid = 0; + f2fs_i_xnid_write(inode, 0); /* need to do checkpoint during fsync */ F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); @@ -951,10 +1021,10 @@ struct page *new_node_page(struct dnode_of_data *dn, struct page *page; int err; - if (unlikely(is_inode_flag_set(F2FS_I(dn->inode), FI_NO_ALLOC))) + if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = grab_cache_page(NODE_MAPPING(sbi), dn->nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -971,23 +1041,19 @@ struct page *new_node_page(struct dnode_of_data *dn, new_ni.ino = dn->inode->i_ino; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); set_cold_node(dn->inode, page); - SetPageUptodate(page); - set_page_dirty(page); + if (!PageUptodate(page)) + SetPageUptodate(page); + if (set_page_dirty(page)) + dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) - F2FS_I(dn->inode)->i_xattr_nid = dn->nid; + f2fs_i_xnid_write(dn->inode, dn->nid); - dn->node_page = page; - if (ipage) - update_inode(dn->inode, ipage); - else - sync_inode_page(dn); if (ofs == 0) inc_valid_inode_count(sbi); - return page; fail: @@ -1013,6 +1079,9 @@ static int read_node_page(struct page *page, int rw) .encrypted_page = NULL, }; + if (PageUptodate(page)) + return LOCKED_PAGE; + get_node_info(sbi, page->index, &ni); if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1020,10 +1089,7 @@ static int read_node_page(struct page *page, int rw) return -ENOENT; } - if (PageUptodate(page)) - return LOCKED_PAGE; - - fio.blk_addr = ni.blk_addr; + fio.new_blkaddr = fio.old_blkaddr = ni.blk_addr; return f2fs_submit_page_bio(&fio); } @@ -1035,14 +1101,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) struct page *apage; int err; - apage = find_get_page(NODE_MAPPING(sbi), nid); - if (apage && PageUptodate(apage)) { - f2fs_put_page(apage, 0); + if (!nid) return; - } - f2fs_put_page(apage, 0); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); - apage = grab_cache_page(NODE_MAPPING(sbi), nid); + rcu_read_lock(); + apage = radix_tree_lookup(&NODE_MAPPING(sbi)->page_tree, nid); + rcu_read_unlock(); + if (apage) + return; + + apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!apage) return; @@ -1050,53 +1119,17 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start) { struct page *page; int err; -repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); - if (!page) - return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); - if (err < 0) { - f2fs_put_page(page, 1); - return ERR_PTR(err); - } else if (err != LOCKED_PAGE) { - lock_page(page); - } - - if (unlikely(!PageUptodate(page) || nid != nid_of_node(page))) { - ClearPageUptodate(page); - f2fs_put_page(page, 1); - return ERR_PTR(-EIO); - } - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); - goto repeat; - } - return page; -} - -/* - * Return a locked page for the desired node page. - * And, readahead MAX_RA_NODE number of node pages. - */ -struct page *get_node_page_ra(struct page *parent, int start) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - struct blk_plug plug; - struct page *page; - int err, i, end; - nid_t nid; - - /* First, try getting the desired direct node. */ - nid = get_nid(parent, start, false); if (!nid) return ERR_PTR(-ENOENT); + f2fs_bug_on(sbi, check_nid_range(sbi, nid)); repeat: - page = grab_cache_page(NODE_MAPPING(sbi), nid); + page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); if (!page) return ERR_PTR(-ENOMEM); @@ -1108,61 +1141,116 @@ struct page *get_node_page_ra(struct page *parent, int start) goto page_hit; } - blk_start_plug(&plug); - - /* Then, try readahead for siblings of the desired node */ - end = start + MAX_RA_NODE; - end = min(end, NIDS_PER_BLOCK); - for (i = start + 1; i < end; i++) { - nid = get_nid(parent, i, false); - if (!nid) - continue; - ra_node_page(sbi, nid); - } - - blk_finish_plug(&plug); + if (parent) + ra_node_pages(parent, start + 1, MAX_RA_NODE); lock_page(page); + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { f2fs_put_page(page, 1); goto repeat; } + + if (unlikely(!PageUptodate(page))) + goto out_err; page_hit: - if (unlikely(!PageUptodate(page))) { + if(unlikely(nid != nid_of_node(page))) { + f2fs_bug_on(sbi, 1); + ClearPageUptodate(page); +out_err: f2fs_put_page(page, 1); return ERR_PTR(-EIO); } return page; } -void sync_inode_page(struct dnode_of_data *dn) +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - if (IS_INODE(dn->node_page) || dn->inode_page == dn->node_page) { - update_inode(dn->inode, dn->node_page); - } else if (dn->inode_page) { - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - update_inode(dn->inode, dn->inode_page); - if (!dn->inode_page_locked) - unlock_page(dn->inode_page); - } else { - update_inode_page(dn->inode); - } + return __get_node_page(sbi, nid, NULL, 0); } -int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, - struct writeback_control *wbc) +struct page *get_node_page_ra(struct page *parent, int start) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + nid_t nid = get_nid(parent, start, false); + + return __get_node_page(sbi, nid, parent, start); +} + +static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) +{ + struct inode *inode; + struct page *page; + int ret; + + /* should flush inline_data before evict_inode */ + inode = ilookup(sbi->sb, ino); + if (!inode) + return; + + page = pagecache_get_page(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); + if (!page) + goto iput_out; + + if (!PageUptodate(page)) + goto page_out; + + if (!PageDirty(page)) + goto page_out; + + if (!clear_page_dirty_for_io(page)) + goto page_out; + + ret = f2fs_write_inline_data(inode, page); + inode_dec_dirty_pages(inode); + if (ret) + set_page_dirty(page); +page_out: + f2fs_put_page(page, 1); +iput_out: + iput(inode); +} + +void move_node_page(struct page *node_page, int gc_type) +{ + if (gc_type == FG_GC) { + struct f2fs_sb_info *sbi = F2FS_P_SB(node_page); + struct writeback_control wbc = { + .sync_mode = WB_SYNC_ALL, + .nr_to_write = 1, + .for_reclaim = 0, + }; + + set_page_dirty(node_page); + f2fs_wait_on_page_writeback(node_page, NODE, true); + + f2fs_bug_on(sbi, PageWriteback(node_page)); + if (!clear_page_dirty_for_io(node_page)) + goto out_page; + + if (NODE_MAPPING(sbi)->a_ops->writepage(node_page, &wbc)) + unlock_page(node_page); + goto release_page; + } else { + /* set page dirty and write it */ + if (!PageWriteback(node_page)) + set_page_dirty(node_page); + } +out_page: + unlock_page(node_page); +release_page: + f2fs_put_page(node_page, 0); +} + +static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index, end; struct pagevec pvec; - int step = ino ? 2 : 0; - int nwritten = 0, wrote = 0; + struct page *last_page = NULL; pagevec_init(&pvec, 0); - -next_step: index = 0; - end = LONG_MAX; + end = ULONG_MAX; while (index <= end) { int i, nr_pages; @@ -1175,6 +1263,190 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return ERR_PTR(-EIO); + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page)) { + /* someone wrote it for us */ + goto continue_unlock; + } + + if (last_page) + f2fs_put_page(last_page, 0); + + get_page(page); + last_page = page; + unlock_page(page); + } + pagevec_release(&pvec); + cond_resched(); + } + return last_page; +} + +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic) +{ + pgoff_t index, end; + struct pagevec pvec; + int ret = 0; + struct page *last_page = NULL; + bool marked = false; + nid_t ino = inode->i_ino; + int nwritten = 0; + + if (atomic) { + last_page = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_page)) + return PTR_ERR_OR_ZERO(last_page); + } +retry: + pagevec_init(&pvec, 0); + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_put_page(last_page, 0); + pagevec_release(&pvec); + return -EIO; + } + + if (!IS_DNODE(page) || !is_cold_node(page)) + continue; + if (ino_of_node(page) != ino) + continue; + + lock_page(page); + + if (unlikely(page->mapping != NODE_MAPPING(sbi))) { +continue_unlock: + unlock_page(page); + continue; + } + if (ino_of_node(page) != ino) + goto continue_unlock; + + if (!PageDirty(page) && page != last_page) { + /* someone wrote it for us */ + goto continue_unlock; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + BUG_ON(PageWriteback(page)); + + if (!atomic || page == last_page) { + set_fsync_mark(page, 1); + if (IS_INODE(page)) { + if (is_inode_flag_set(inode, + FI_DIRTY_INODE)) + update_inode(inode, page); + set_dentry_mark(page, + need_dentry_mark(sbi, ino)); + } + /* may be written by other thread */ + if (!PageDirty(page)) + set_page_dirty(page); + } + + if (!clear_page_dirty_for_io(page)) + goto continue_unlock; + + ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + if (ret) { + unlock_page(page); + f2fs_put_page(last_page, 0); + break; + } else { + nwritten++; + } + + if (page == last_page) { + f2fs_put_page(page, 0); + marked = true; + break; + } + } + pagevec_release(&pvec); + cond_resched(); + + if (ret || marked) + break; + } + if (!ret && atomic && !marked) { + f2fs_msg(sbi->sb, KERN_DEBUG, + "Retry to write fsync mark: ino=%u, idx=%lx", + ino, last_page->index); + lock_page(last_page); + set_page_dirty(last_page); + unlock_page(last_page); + goto retry; + } + + if (nwritten) + f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + return ret ? -EIO: 0; +} + +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +{ + pgoff_t index, end; + struct pagevec pvec; + int step = 0; + int nwritten = 0; + int ret = 0; + + pagevec_init(&pvec, 0); + +next_step: + index = 0; + end = ULONG_MAX; + + while (index <= end) { + int i, nr_pages; + nr_pages = pagevec_lookup_tag(&pvec, NODE_MAPPING(sbi), &index, + PAGECACHE_TAG_DIRTY, + min(end - index, (pgoff_t)PAGEVEC_SIZE-1) + 1); + if (nr_pages == 0) + break; + + for (i = 0; i < nr_pages; i++) { + struct page *page = pvec.pages[i]; + + if (unlikely(f2fs_cp_error(sbi))) { + pagevec_release(&pvec); + ret = -EIO; + goto out; + } + /* * flushing sequence with step: * 0. indirect nodes @@ -1189,14 +1461,8 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, if (step == 2 && (!IS_DNODE(page) || !is_cold_node(page))) continue; - - /* - * If an fsync mode, - * we should not skip writing node pages. - */ - if (ino && ino_of_node(page) == ino) - lock_page(page); - else if (!trylock_page(page)) +lock_node: + if (!trylock_page(page)) continue; if (unlikely(page->mapping != NODE_MAPPING(sbi))) { @@ -1204,33 +1470,33 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, unlock_page(page); continue; } - if (ino && ino_of_node(page) != ino) - goto continue_unlock; if (!PageDirty(page)) { /* someone wrote it for us */ goto continue_unlock; } + /* flush inline_data */ + if (is_inline_node(page)) { + clear_inline_node(page); + unlock_page(page); + flush_inline_data(sbi, ino_of_node(page)); + goto lock_node; + } + + f2fs_wait_on_page_writeback(page, NODE, true); + + BUG_ON(PageWriteback(page)); if (!clear_page_dirty_for_io(page)) goto continue_unlock; - /* called by fsync() */ - if (ino && IS_DNODE(page)) { - set_fsync_mark(page, 1); - if (IS_INODE(page)) - set_dentry_mark(page, - need_dentry_mark(sbi, ino)); - nwritten++; - } else { - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); - } + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) unlock_page(page); else - wrote++; + nwritten++; if (--wbc->nr_to_write == 0) break; @@ -1248,15 +1514,15 @@ int sync_node_pages(struct f2fs_sb_info *sbi, nid_t ino, step++; goto next_step; } - - if (wrote) +out: + if (nwritten) f2fs_submit_merged_bio(sbi, NODE, WRITE); - return nwritten; + return ret; } int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) { - pgoff_t index = 0, end = LONG_MAX; + pgoff_t index = 0, end = ULONG_MAX; struct pagevec pvec; int ret2 = 0, ret = 0; @@ -1278,7 +1544,7 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) continue; if (ino && ino_of_node(page) == ino) { - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); if (TestClearPageError(page)) ret = -EIO; } @@ -1317,8 +1583,6 @@ static int f2fs_write_node_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - f2fs_wait_on_page_writeback(page, NODE); - /* get old block addr of this node page */ nid = nid_of_node(page); f2fs_bug_on(sbi, page->index != nid); @@ -1342,14 +1606,18 @@ static int f2fs_write_node_page(struct page *page, } set_page_writeback(page); - fio.blk_addr = ni.blk_addr; + fio.old_blkaddr = ni.blk_addr; write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.blk_addr, is_fsync_dnode(page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); dec_page_count(sbi, F2FS_DIRTY_NODES); up_read(&sbi->node_write); - unlock_page(page); if (wbc->for_reclaim) + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_bio(sbi, NODE, WRITE); return 0; @@ -1363,10 +1631,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); + struct blk_plug plug; long diff; - trace_f2fs_writepages(mapping->host, wbc, NODE); - /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1374,14 +1641,19 @@ static int f2fs_write_node_pages(struct address_space *mapping, if (get_pages(sbi, F2FS_DIRTY_NODES) < nr_pages_to_skip(sbi, NODE)) goto skip_write; + trace_f2fs_writepages(mapping->host, wbc, NODE); + diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; - sync_node_pages(sbi, 0, wbc); + blk_start_plug(&plug); + sync_node_pages(sbi, wbc); + blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; skip_write: wbc->pages_skipped += get_pages(sbi, F2FS_DIRTY_NODES); + trace_f2fs_writepages(mapping->host, wbc, NODE); return 0; } @@ -1389,9 +1661,10 @@ static int f2fs_set_node_page_dirty(struct page *page) { trace_f2fs_set_page_dirty(page, NODE); - SetPageUptodate(page); + if (!PageUptodate(page)) + SetPageUptodate(page); if (!PageDirty(page)) { - __set_page_dirty_nobuffers(page); + f2fs_set_page_dirty_nobuffers(page); inc_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); SetPagePrivate(page); f2fs_trace_pid(page); @@ -1409,6 +1682,9 @@ const struct address_space_operations f2fs_node_aops = { .set_page_dirty = f2fs_set_node_page_dirty, .invalidatepage = f2fs_invalidate_page, .releasepage = f2fs_release_page, +#ifdef CONFIG_MIGRATION + .migratepage = f2fs_migrate_page, +#endif }; static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, @@ -1429,7 +1705,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; - bool allocated = false; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1440,14 +1715,9 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) if (build) { /* do not add allocated nids */ - down_read(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); - if (ne && - (!get_nat_flag(ne, IS_CHECKPOINTED) || + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || nat_get_blkaddr(ne) != NULL_ADDR)) - allocated = true; - up_read(&nm_i->nat_tree_lock); - if (allocated) return 0; } @@ -1516,22 +1786,24 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -static void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i = 0; nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt > NAT_ENTRY_PER_BLOCK) + if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); + down_read(&nm_i->nat_tree_lock); + while (1) { struct page *page = get_current_nat_page(sbi, nid); @@ -1550,16 +1822,19 @@ static void build_free_nids(struct f2fs_sb_info *sbi) nm_i->next_scan_nid = nid; /* find free nids from current sum_pages */ - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { - block_t addr = le32_to_cpu(nat_in_journal(sum, i).block_addr); - nid = le32_to_cpu(nid_in_journal(sum, i)); + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else remove_free_nid(nm_i, nid); } - mutex_unlock(&curseg->curseg_mutex); + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nm_i->next_scan_nid), nm_i->ra_nid_pages, META_NAT, false); @@ -1575,6 +1850,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i = NULL; retry: +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_ALLOC_NID)) + return false; +#endif if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; @@ -1582,8 +1861,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) /* We should not use stale free nids created by build_free_nids */ if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - struct node_info ni; - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); list_for_each_entry(i, &nm_i->free_nid_list, list) if (i->state == NID_NEW) @@ -1594,13 +1871,6 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) i->state = NID_ALLOC; nm_i->fcnt--; spin_unlock(&nm_i->free_nid_list_lock); - - /* check nid is allocated already */ - get_node_info(sbi, *nid, &ni); - if (ni.blk_addr != NULL_ADDR) { - alloc_nid_done(sbi, *nid); - goto retry; - } return true; } spin_unlock(&nm_i->free_nid_list_lock); @@ -1663,12 +1933,15 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; + if (nm_i->fcnt <= MAX_FREE_NIDS) + return 0; + if (!mutex_trylock(&nm_i->build_lock)) return 0; spin_lock(&nm_i->free_nid_list_lock); list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= NAT_ENTRY_PER_BLOCK) + if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) break; if (i->state == NID_ALLOC) continue; @@ -1695,7 +1968,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) ri = F2FS_INODE(page); if (!(ri->i_inline & F2FS_INLINE_XATTR)) { - clear_inode_flag(F2FS_I(inode), FI_INLINE_XATTR); + clear_inode_flag(inode, FI_INLINE_XATTR); goto update_inode; } @@ -1703,7 +1976,7 @@ void recover_inline_xattr(struct inode *inode, struct page *page) src_addr = inline_xattr_addr(page); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); memcpy(dst_addr, src_addr, inline_size); update_inode: update_inode(inode, ipage); @@ -1737,13 +2010,11 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); - F2FS_I(inode)->i_xattr_nid = new_xnid; + f2fs_i_xnid_write(inode, new_xnid); /* 3: update xattr blkaddr */ refresh_sit_entry(sbi, NEW_ADDR, blkaddr); set_node_addr(sbi, &ni, blkaddr, false); - - update_inode_page(inode); } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1757,15 +2028,18 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; - - ipage = grab_cache_page(NODE_MAPPING(sbi), ino); - if (!ipage) - return -ENOMEM; +retry: + ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); + if (!ipage) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry; + } /* Should not use this inode from free nid list */ remove_free_nid(NM_I(sbi), ino); - SetPageUptodate(ipage); + if (!PageUptodate(ipage)) + SetPageUptodate(ipage); fill_node_footer(ipage, ino, ino, 0, true); src = F2FS_INODE(page); @@ -1831,28 +2105,26 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < nats_in_cursum(sum); i++) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { struct nat_entry *ne; struct f2fs_nat_entry raw_ne; - nid_t nid = le32_to_cpu(nid_in_journal(sum, i)); + nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); - raw_ne = nat_in_journal(sum, i); + raw_ne = nat_in_journal(journal, i); - down_write(&nm_i->nat_tree_lock); ne = __lookup_nat_cache(nm_i, nid); if (!ne) { ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } __set_nat_cache_dirty(nm_i, ne); - up_write(&nm_i->nat_tree_lock); } - update_nats_in_cursum(sum, -i); - mutex_unlock(&curseg->curseg_mutex); + update_nats_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } static void __adjust_nat_entry_set(struct nat_entry_set *nes, @@ -1877,24 +2149,23 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, struct nat_entry_set *set) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; nid_t start_nid = set->set * NAT_ENTRY_PER_BLOCK; bool to_journal = true; struct f2fs_nat_block *nat_blk; struct nat_entry *ne, *cur; struct page *page = NULL; - struct f2fs_nm_info *nm_i = NM_I(sbi); /* * there are two steps to flush nat entries: * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(sum, set->entry_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { - mutex_lock(&curseg->curseg_mutex); + down_write(&curseg->journal_rwsem); } else { page = get_next_nat_page(sbi, start_nid); nat_blk = page_address(page); @@ -1911,35 +2182,29 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, continue; if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, NAT_JOURNAL, nid, 1); f2fs_bug_on(sbi, offset < 0); - raw_ne = &nat_in_journal(sum, offset); - nid_in_journal(sum, offset) = cpu_to_le32(nid); + raw_ne = &nat_in_journal(journal, offset); + nid_in_journal(journal, offset) = cpu_to_le32(nid); } else { raw_ne = &nat_blk->entries[nid - start_nid]; } raw_nat_from_node_info(raw_ne, &ne->ni); - - down_write(&NM_I(sbi)->nat_tree_lock); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - up_write(&NM_I(sbi)->nat_tree_lock); - if (nat_get_blkaddr(ne) == NULL_ADDR) add_free_nid(sbi, nid, false); } if (to_journal) - mutex_unlock(&curseg->curseg_mutex); + up_write(&curseg->journal_rwsem); else f2fs_put_page(page, 1); f2fs_bug_on(sbi, set->entry_cnt); - down_write(&nm_i->nat_tree_lock); radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - up_write(&nm_i->nat_tree_lock); kmem_cache_free(nat_entry_set_slab, set); } @@ -1950,7 +2215,7 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct nat_entry_set *setvec[SETVEC_SIZE]; struct nat_entry_set *set, *tmp; unsigned int found; @@ -1959,29 +2224,32 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) if (!nm_i->dirty_nat_cnt) return; + + down_write(&nm_i->nat_tree_lock); + /* * if there are no enough space in journal to store dirty nat * entries, remove all entries from journal and merge them * into nat entry set. */ - if (!__has_cursum_space(sum, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); - down_write(&nm_i->nat_tree_lock); while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, - MAX_NAT_JENTRIES(sum)); + MAX_NAT_JENTRIES(journal)); } - up_write(&nm_i->nat_tree_lock); /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) __flush_nat_entry_set(sbi, set); + up_write(&nm_i->nat_tree_lock); + f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); } @@ -2006,6 +2274,7 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; + nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); INIT_LIST_HEAD(&nm_i->free_nid_list); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e4fffd2d98c4..868bec65e51c 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -15,15 +15,21 @@ #define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ -#define FREE_NID_PAGES 4 +#define FREE_NID_PAGES 8 +#define MAX_FREE_NIDS (NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES) -#define DEF_RA_NID_PAGES 4 /* # of nid pages to be readaheaded */ +#define DEF_RA_NID_PAGES 0 /* # of nid pages to be readaheaded */ /* maximum readahead size for node during getting data blocks */ #define MAX_RA_NODE 128 /* control the memory footprint threshold (10MB per 1GB ram) */ -#define DEF_RAM_THRESHOLD 10 +#define DEF_RAM_THRESHOLD 1 + +/* control dirty nats ratio threshold (default: 10% over max nid count) */ +#define DEF_DIRTY_NAT_RATIO_THRESHOLD 10 +/* control total # of nats */ +#define DEF_NAT_CACHE_THRESHOLD 100000 /* vector size for gang look-up from nat cache that consists of radix tree */ #define NATVEC_SIZE 64 @@ -117,6 +123,17 @@ static inline void raw_nat_from_node_info(struct f2fs_nat_entry *raw_ne, raw_ne->version = ni->version; } +static inline bool excess_dirty_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->dirty_nat_cnt >= NM_I(sbi)->max_nid * + NM_I(sbi)->dirty_nats_ratio / 100; +} + +static inline bool excess_cached_nats(struct f2fs_sb_info *sbi) +{ + return NM_I(sbi)->nat_cnt >= DEF_NAT_CACHE_THRESHOLD; +} + enum mem_type { FREE_NIDS, /* indicates the free nid list */ NAT_ENTRIES, /* indicates the cached nat entry */ @@ -183,7 +200,7 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) block_addr = (pgoff_t)(nm_i->nat_blkaddr + (seg_off << sbi->log_blocks_per_seg << 1) + - (block_off & ((1 << sbi->log_blocks_per_seg) - 1))); + (block_off & (sbi->blocks_per_seg - 1))); if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -212,6 +229,37 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) f2fs_change_bit(block_off, nm_i->nat_bitmap); } +static inline nid_t ino_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.ino); +} + +static inline nid_t nid_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.nid); +} + +static inline unsigned int ofs_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + unsigned flag = le32_to_cpu(rn->footer.flag); + return flag >> OFFSET_BIT_SHIFT; +} + +static inline __u64 cpver_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le64_to_cpu(rn->footer.cp_ver); +} + +static inline block_t next_blkaddr_of_node(struct page *node_page) +{ + struct f2fs_node *rn = F2FS_NODE(node_page); + return le32_to_cpu(rn->footer.next_blkaddr); +} + static inline void fill_node_footer(struct page *page, nid_t nid, nid_t ino, unsigned int ofs, bool reset) { @@ -242,40 +290,30 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); - rn->footer.cp_ver = ckpt->checkpoint_ver; + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } -static inline nid_t ino_of_node(struct page *node_page) +static inline bool is_recoverable_dnode(struct page *page) { - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.ino); -} + struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); + size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); + __u64 cp_ver = cur_cp_version(ckpt); -static inline nid_t nid_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.nid); -} - -static inline unsigned int ofs_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - unsigned flag = le32_to_cpu(rn->footer.flag); - return flag >> OFFSET_BIT_SHIFT; -} - -static inline unsigned long long cpver_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le64_to_cpu(rn->footer.cp_ver); -} - -static inline block_t next_blkaddr_of_node(struct page *node_page) -{ - struct f2fs_node *rn = F2FS_NODE(node_page); - return le32_to_cpu(rn->footer.next_blkaddr); + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { + __u64 crc = le32_to_cpu(*((__le32 *) + ((unsigned char *)ckpt + crc_offset))); + cp_ver |= (crc << 32); + } + return cpu_to_le64(cp_ver) == cpver_of_node(page); } /* @@ -317,17 +355,17 @@ static inline bool IS_DNODE(struct page *node_page) return true; } -static inline void set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct page *p, int off, nid_t nid, bool i) { struct f2fs_node *rn = F2FS_NODE(p); - f2fs_wait_on_page_writeback(p, NODE); + f2fs_wait_on_page_writeback(p, NODE, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - set_page_dirty(p); + return set_page_dirty(p); } static inline nid_t get_nid(struct page *p, int off, bool i) @@ -370,6 +408,21 @@ static inline int is_node(struct page *page, int type) #define is_fsync_dnode(page) is_node(page, FSYNC_BIT_SHIFT) #define is_dent_dnode(page) is_node(page, DENT_BIT_SHIFT) +static inline int is_inline_node(struct page *page) +{ + return PageChecked(page); +} + +static inline void set_inline_node(struct page *page) +{ + SetPageChecked(page); +} + +static inline void clear_inline_node(struct page *page) +{ + ClearPageChecked(page); +} + static inline void set_cold_node(struct inode *inode, struct page *page) { struct f2fs_node *rn = F2FS_NODE(page); diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index e32f349f341b..ba7372f7f95d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -49,8 +49,9 @@ static struct kmem_cache *fsync_entry_slab; bool space_for_roll_forward(struct f2fs_sb_info *sbi) { - if (sbi->last_valid_block_count + sbi->alloc_valid_block_count - > sbi->user_block_count) + s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count); + + if (sbi->last_valid_block_count + nalloc > sbi->user_block_count) return false; return true; } @@ -67,42 +68,71 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, return NULL; } -static int recover_dentry(struct inode *inode, struct page *ipage) +static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, + struct list_head *head, nid_t ino) +{ + struct inode *inode; + struct fsync_inode_entry *entry; + + inode = f2fs_iget_retry(sbi->sb, ino); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); + entry->inode = inode; + list_add_tail(&entry->list, head); + + return entry; +} + +static void del_fsync_inode(struct fsync_inode_entry *entry) +{ + iput(entry->inode); + list_del(&entry->list); + kmem_cache_free(fsync_entry_slab, entry); +} + +static int recover_dentry(struct inode *inode, struct page *ipage, + struct list_head *dir_list) { struct f2fs_inode *raw_inode = F2FS_INODE(ipage); nid_t pino = le32_to_cpu(raw_inode->i_pino); struct f2fs_dir_entry *de; - struct qstr name; + struct fscrypt_name fname; struct page *page; struct inode *dir, *einode; + struct fsync_inode_entry *entry; int err = 0; + char *name; - dir = f2fs_iget(inode->i_sb, pino); - if (IS_ERR(dir)) { - err = PTR_ERR(dir); - goto out; + entry = get_fsync_inode(dir_list, pino); + if (!entry) { + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + if (IS_ERR(entry)) { + dir = ERR_CAST(entry); + err = PTR_ERR(entry); + goto out; + } } - if (file_enc_name(inode)) { - iput(dir); - return 0; - } + dir = entry->inode; - name.len = le32_to_cpu(raw_inode->i_namelen); - name.name = raw_inode->i_name; + memset(&fname, 0, sizeof(struct fscrypt_name)); + fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen); + fname.disk_name.name = raw_inode->i_name; - if (unlikely(name.len > F2FS_NAME_LEN)) { + if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) { WARN_ON(1); err = -ENAMETOOLONG; - goto out_err; + goto out; } retry: - de = f2fs_find_entry(dir, &name, &page); + de = __f2fs_find_entry(dir, &fname, &page); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_unmap_put; if (de) { - einode = f2fs_iget(inode->i_sb, le32_to_cpu(de->ino)); + einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino)); if (IS_ERR(einode)) { WARN_ON(1); err = PTR_ERR(einode); @@ -118,29 +148,27 @@ static int recover_dentry(struct inode *inode, struct page *ipage) f2fs_delete_entry(de, page, dir, einode); iput(einode); goto retry; - } - err = __f2fs_add_link(dir, &name, inode, inode->i_ino, inode->i_mode); - if (err) - goto out_err; - - if (is_inode_flag_set(F2FS_I(dir), FI_DELAY_IPUT)) { - iput(dir); + } else if (IS_ERR(page)) { + err = PTR_ERR(page); } else { - add_dirty_dir_inode(dir); - set_inode_flag(F2FS_I(dir), FI_DELAY_IPUT); + err = __f2fs_do_add_link(dir, &fname, inode, + inode->i_ino, inode->i_mode); } - + if (err == -ENOMEM) + goto retry; goto out; out_unmap_put: f2fs_dentry_kunmap(dir, page); f2fs_put_page(page, 0); -out_err: - iput(dir); out: + if (file_enc_name(inode)) + name = ""; + else + name = raw_inode->i_name; f2fs_msg(inode->i_sb, KERN_NOTICE, "%s: ino = %x, name = %s, dir = %lx, err = %d", - __func__, ino_of_node(ipage), raw_inode->i_name, + __func__, ino_of_node(ipage), name, IS_ERR(dir) ? 0 : dir->i_ino, err); return err; } @@ -151,7 +179,7 @@ static void recover_inode(struct inode *inode, struct page *page) char *name; inode->i_mode = le16_to_cpu(raw->i_mode); - i_size_write(inode, le64_to_cpu(raw->i_size)); + f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); @@ -168,9 +196,34 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } +static bool is_same_inode(struct inode *inode, struct page *ipage) +{ + struct f2fs_inode *ri = F2FS_INODE(ipage); + struct timespec disk; + + if (!IS_INODE(ipage)) + return true; + + disk.tv_sec = le64_to_cpu(ri->i_ctime); + disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); + if (timespec_compare(&inode->i_ctime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_atime); + disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); + if (timespec_compare(&inode->i_atime, &disk) > 0) + return false; + + disk.tv_sec = le64_to_cpu(ri->i_mtime); + disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); + if (timespec_compare(&inode->i_mtime, &disk) > 0) + return false; + + return true; +} + static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; block_t blkaddr; @@ -180,8 +233,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - ra_meta_pages(sbi, blkaddr, 1, META_POR, true); - while (1) { struct fsync_inode_entry *entry; @@ -190,49 +241,41 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) + if (!is_recoverable_dnode(page)) break; if (!is_fsync_dnode(page)) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (!entry) { + if (entry) { + if (!is_same_inode(entry->inode, page)) + goto next; + } else { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; } - /* add this fsync inode to the list */ - entry = kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); - if (!entry) { - err = -ENOMEM; - break; - } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry->inode = f2fs_iget(sbi->sb, ino_of_node(page)); - if (IS_ERR(entry->inode)) { - err = PTR_ERR(entry->inode); - kmem_cache_free(fsync_entry_slab, entry); + entry = add_fsync_inode(sbi, head, ino_of_node(page)); + if (IS_ERR(entry)) { + err = PTR_ERR(entry); if (err == -ENOENT) { err = 0; goto next; } break; } - list_add_tail(&entry->list, head); } entry->blkaddr = blkaddr; - if (IS_INODE(page)) { - entry->last_inode = blkaddr; - if (is_dent_dnode(page)) - entry->last_dentry = blkaddr; - } + if (IS_INODE(page) && is_dent_dnode(page)) + entry->last_dentry = blkaddr; next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -248,11 +291,8 @@ static void destroy_fsync_dnodes(struct list_head *head) { struct fsync_inode_entry *entry, *tmp; - list_for_each_entry_safe(entry, tmp, head, list) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + list_for_each_entry_safe(entry, tmp, head, list) + del_fsync_inode(entry); } static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, @@ -314,15 +354,14 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, if (ino != dn->inode->i_ino) { /* Deallocate previous index in the node page */ - inode = f2fs_iget(sbi->sb, ino); + inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); } else { inode = dn->inode; } - bidx = start_bidx_of_node(offset, F2FS_I(inode)) + - le16_to_cpu(sum.ofs_in_node); + bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node); /* * if inode page is locked, unlock temporarily, but its reference @@ -357,10 +396,9 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, block_t blkaddr) { - struct f2fs_inode_info *fi = F2FS_I(inode); - unsigned int start, end; struct dnode_of_data dn; struct node_info ni; + unsigned int start, end; int err = 0, recovered = 0; /* step 1: recover xattr */ @@ -380,16 +418,21 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, goto out; /* step 3: recover data indices */ - start = start_bidx_of_node(ofs_of_node(page), fi); - end = start + ADDRS_PER_PAGE(page, fi); + start = start_bidx_of_node(ofs_of_node(page), inode); + end = start + ADDRS_PER_PAGE(page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); - +retry_dn: err = get_dnode_of_data(&dn, start, ALLOC_NODE); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_dn; + } goto out; + } - f2fs_wait_on_page_writeback(dn.node_page, NODE); + f2fs_wait_on_page_writeback(dn.node_page, NODE, true); get_node_info(sbi, dn.nid, &ni); f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); @@ -411,14 +454,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } + if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + /* * dest is reserved block, invalidate src block * and then reserve one new block in dnode page. */ if (dest == NEW_ADDR) { truncate_data_blocks_range(&dn, 1); - err = reserve_new_block(&dn); - f2fs_bug_on(sbi, err); + reserve_new_block(&dn); continue; } @@ -427,25 +472,33 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (src == NULL_ADDR) { err = reserve_new_block(&dn); +#ifdef CONFIG_F2FS_FAULT_INJECTION + while (err) + err = reserve_new_block(&dn); +#endif /* We should not get -ENOSPC */ f2fs_bug_on(sbi, err); + if (err) + goto err; } - +retry_prev: /* Check the previous node page having this index */ err = check_index_in_prev_nodes(sbi, dest, &dn); - if (err) + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + goto retry_prev; + } goto err; + } /* write dummy data page */ f2fs_replace_block(sbi, &dn, src, dest, - ni.version, false); + ni.version, false, false); recovered++; } } - if (IS_INODE(dn.node_page)) - sync_inode_page(&dn); - copy_node_footer(dn.node_page, page); fill_node_footer(dn.node_page, dn.nid, ni.ino, ofs_of_node(page), false); @@ -459,17 +512,16 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, return err; } -static int recover_data(struct f2fs_sb_info *sbi, - struct list_head *head, int type) +static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, + struct list_head *dir_list) { - unsigned long long cp_ver = cur_cp_version(F2FS_CKPT(sbi)); struct curseg_info *curseg; struct page *page = NULL; int err = 0; block_t blkaddr; /* get node pages in the current segment */ - curseg = CURSEG_I(sbi, type); + curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); while (1) { @@ -482,12 +534,12 @@ static int recover_data(struct f2fs_sb_info *sbi, page = get_tmp_page(sbi, blkaddr); - if (cp_ver != cpver_of_node(page)) { + if (!is_recoverable_dnode(page)) { f2fs_put_page(page, 1); break; } - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(page)); if (!entry) goto next; /* @@ -495,10 +547,10 @@ static int recover_data(struct f2fs_sb_info *sbi, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (entry->last_inode == blkaddr) + if (IS_INODE(page)) recover_inode(entry->inode, page); if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, page); + err = recover_dentry(entry->inode, page, dir_list); if (err) { f2fs_put_page(page, 1); break; @@ -510,11 +562,8 @@ static int recover_data(struct f2fs_sb_info *sbi, break; } - if (entry->blkaddr == blkaddr) { - iput(entry->inode); - list_del(&entry->list); - kmem_cache_free(fsync_entry_slab, entry); - } + if (entry->blkaddr == blkaddr) + del_fsync_inode(entry); next: /* check next segment */ blkaddr = next_blkaddr_of_node(page); @@ -525,12 +574,14 @@ static int recover_data(struct f2fs_sb_info *sbi, return err; } -int recover_fsync_data(struct f2fs_sb_info *sbi) +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; + struct list_head dir_list; block_t blkaddr; int err; + int ret = 0; bool need_writecp = false; fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", @@ -539,6 +590,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) return -ENOMEM; INIT_LIST_HEAD(&inode_list); + INIT_LIST_HEAD(&dir_list); /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); @@ -547,25 +599,26 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); - if (err) + if (err || list_empty(&inode_list)) goto out; - if (list_empty(&inode_list)) + if (check_only) { + ret = 1; goto out; + } need_writecp = true; /* step #2: recover data */ - err = recover_data(sbi, &inode_list, CURSEG_WARM_NODE); + err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); out: destroy_fsync_dnodes(&inode_list); - kmem_cache_destroy(fsync_entry_slab); /* truncate meta pages to be used by the recovery */ truncate_inode_pages_range(META_MAPPING(sbi), - (loff_t)MAIN_BLKADDR(sbi) << PAGE_CACHE_SHIFT, -1); + (loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1); if (err) { truncate_inode_pages_final(NODE_MAPPING(sbi)); @@ -573,31 +626,20 @@ int recover_fsync_data(struct f2fs_sb_info *sbi) } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) { - bool invalidate = false; + if (err) + set_ckpt_flags(sbi, CP_ERROR_FLAG); + mutex_unlock(&sbi->cp_mutex); - if (discard_next_dnode(sbi, blkaddr)) - invalidate = true; + /* let's drop all the directory inodes for clean checkpoint */ + destroy_fsync_dnodes(&dir_list); - /* Flush all the NAT/SIT pages */ - while (get_pages(sbi, F2FS_DIRTY_META)) - sync_meta_pages(sbi, META, LONG_MAX); - - /* invalidate temporary meta page */ - if (invalidate) - invalidate_mapping_pages(META_MAPPING(sbi), - blkaddr, blkaddr); - - set_ckpt_flags(sbi->ckpt, CP_ERROR_FLAG); - mutex_unlock(&sbi->cp_mutex); - } else if (need_writecp) { + if (!err && need_writecp) { struct cp_control cpc = { .reason = CP_RECOVERY, }; - mutex_unlock(&sbi->cp_mutex); - write_checkpoint(sbi, &cpc); - } else { - mutex_unlock(&sbi->cp_mutex); + err = write_checkpoint(sbi, &cpc); } - return err; + + kmem_cache_destroy(fsync_entry_slab); + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 7965957dd0e6..b3c61ae37f92 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -86,6 +86,7 @@ static inline unsigned long __reverse_ffs(unsigned long word) /* * __find_rev_next(_zero)_bit is copied from lib/find_next_bit.c because * f2fs_set_bit makes MSB and LSB reversed in a byte. + * @size must be integral times of unsigned long. * Example: * MSB <--> LSB * f2fs_set_bit(0, bitmap) => 1000 0000 @@ -95,94 +96,73 @@ static unsigned long __find_rev_next_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp &= ~0UL >> offset; + while (1) { + if (*p == 0) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG-1)) { tmp = __reverse_ulong((unsigned char *)p); + + tmp &= ~0UL >> offset; + if (size < BITS_PER_LONG) + tmp &= (~0UL << (BITS_PER_LONG - size)); if (tmp) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp &= (~0UL << (BITS_PER_LONG - size)); - if (!tmp) /* Are any bits set? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffs(tmp); + return result; +found: + return result - size + __reverse_ffs(tmp); } static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, unsigned long size, unsigned long offset) { const unsigned long *p = addr + BIT_WORD(offset); - unsigned long result = offset & ~(BITS_PER_LONG - 1); + unsigned long result = size; unsigned long tmp; if (offset >= size) return size; - size -= result; + size -= (offset & ~(BITS_PER_LONG - 1)); offset %= BITS_PER_LONG; - if (!offset) - goto aligned; - tmp = __reverse_ulong((unsigned char *)p); - tmp |= ~((~0UL << offset) >> offset); + while (1) { + if (*p == ~0UL) + goto pass; - if (size < BITS_PER_LONG) - goto found_first; - if (tmp != ~0UL) - goto found_middle; - - size -= BITS_PER_LONG; - result += BITS_PER_LONG; - p++; -aligned: - while (size & ~(BITS_PER_LONG - 1)) { tmp = __reverse_ulong((unsigned char *)p); + + if (offset) + tmp |= ~0UL << (BITS_PER_LONG - offset); + if (size < BITS_PER_LONG) + tmp |= ~0UL >> size; if (tmp != ~0UL) - goto found_middle; - result += BITS_PER_LONG; + goto found; +pass: + if (size <= BITS_PER_LONG) + break; size -= BITS_PER_LONG; + offset = 0; p++; } - if (!size) - return result; - - tmp = __reverse_ulong((unsigned char *)p); -found_first: - tmp |= ~(~0UL << (BITS_PER_LONG - size)); - if (tmp == ~0UL) /* Are any bits zero? */ - return result + size; /* Nope. */ -found_middle: - return result + __reverse_ffz(tmp); + return result; +found: + return result - size + __reverse_ffz(tmp); } void register_inmem_page(struct inode *inode, struct page *page) @@ -211,69 +191,149 @@ void register_inmem_page(struct inode *inode, struct page *page) trace_f2fs_register_inmem_page(page, INMEM); } -int commit_inmem_pages(struct inode *inode, bool abort) +static int __revoke_inmem_pages(struct inode *inode, + struct list_head *head, bool drop, bool recover) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct inmem_pages *cur, *tmp; + int err = 0; + + list_for_each_entry_safe(cur, tmp, head, list) { + struct page *page = cur->page; + + if (drop) + trace_f2fs_commit_inmem_page(page, INMEM_DROP); + + lock_page(page); + + if (recover) { + struct dnode_of_data dn; + struct node_info ni; + + trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); + + set_new_dnode(&dn, inode, NULL, NULL, 0); + if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = -EAGAIN; + goto next; + } + get_node_info(sbi, dn.nid, &ni); + f2fs_replace_block(sbi, &dn, dn.data_blkaddr, + cur->old_addr, ni.version, true, true); + f2fs_put_dnode(&dn); + } +next: + /* we don't need to invalidate this in the sccessful status */ + if (drop || recover) + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 1); + + list_del(&cur->list); + kmem_cache_free(inmem_entry_slab, cur); + dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + } + return err; +} + +void drop_inmem_pages(struct inode *inode) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + + mutex_lock(&fi->inmem_lock); + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); + mutex_unlock(&fi->inmem_lock); +} + +static int __commit_inmem_pages(struct inode *inode, + struct list_head *revoke_list) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct inmem_pages *cur, *tmp; - bool submit_bio = false; struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, .rw = WRITE_SYNC | REQ_PRIO, .encrypted_page = NULL, }; + bool submit_bio = false; int err = 0; - /* - * The abort is true only when f2fs_evict_inode is called. - * Basically, the f2fs_evict_inode doesn't produce any data writes, so - * that we don't need to call f2fs_balance_fs. - * Otherwise, f2fs_gc in f2fs_balance_fs can wait forever until this - * inode becomes free by iget_locked in f2fs_iget. - */ - if (!abort) { - f2fs_balance_fs(sbi); - f2fs_lock_op(sbi); + list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { + struct page *page = cur->page; + + lock_page(page); + if (page->mapping == inode->i_mapping) { + trace_f2fs_commit_inmem_page(page, INMEM); + + set_page_dirty(page); + f2fs_wait_on_page_writeback(page, DATA, true); + if (clear_page_dirty_for_io(page)) + inode_dec_dirty_pages(inode); + + fio.page = page; + err = do_write_data_page(&fio); + if (err) { + unlock_page(page); + break; + } + + /* record old blkaddr for revoking */ + cur->old_addr = fio.old_blkaddr; + + clear_cold_data(page); + submit_bio = true; + } + unlock_page(page); + list_move_tail(&cur->list, revoke_list); } - mutex_lock(&fi->inmem_lock); - list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { - lock_page(cur->page); - if (!abort) { - if (cur->page->mapping == inode->i_mapping) { - set_page_dirty(cur->page); - f2fs_wait_on_page_writeback(cur->page, DATA); - if (clear_page_dirty_for_io(cur->page)) - inode_dec_dirty_pages(inode); - trace_f2fs_commit_inmem_page(cur->page, INMEM); - fio.page = cur->page; - err = do_write_data_page(&fio); - if (err) { - unlock_page(cur->page); - break; - } - clear_cold_data(cur->page); - submit_bio = true; - } - } else { - trace_f2fs_commit_inmem_page(cur->page, INMEM_DROP); - } - set_page_private(cur->page, 0); - ClearPagePrivate(cur->page); - f2fs_put_page(cur->page, 1); + if (submit_bio) + f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); - list_del(&cur->list); - kmem_cache_free(inmem_entry_slab, cur); - dec_page_count(F2FS_I_SB(inode), F2FS_INMEM_PAGES); + if (!err) + __revoke_inmem_pages(inode, revoke_list, false, false); + + return err; +} + +int commit_inmem_pages(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_inode_info *fi = F2FS_I(inode); + struct list_head revoke_list; + int err; + + INIT_LIST_HEAD(&revoke_list); + f2fs_balance_fs(sbi, true); + f2fs_lock_op(sbi); + + mutex_lock(&fi->inmem_lock); + err = __commit_inmem_pages(inode, &revoke_list); + if (err) { + int ret; + /* + * try to revoke all committed pages, but still we could fail + * due to no memory or other reason, if that happened, EAGAIN + * will be returned, which means in such case, transaction is + * already not integrity, caller should use journal to do the + * recovery or rewrite & commit last transaction. For other + * error number, revoking was done by filesystem itself. + */ + ret = __revoke_inmem_pages(inode, &revoke_list, false, true); + if (ret) + err = ret; + + /* drop all uncommitted pages */ + __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); } mutex_unlock(&fi->inmem_lock); - if (!abort) { - f2fs_unlock_op(sbi); - if (submit_bio) - f2fs_submit_merged_bio(sbi, DATA, WRITE); - } + f2fs_unlock_op(sbi); return err; } @@ -281,13 +341,25 @@ int commit_inmem_pages(struct inode *inode, bool abort) * This function balances dirty node and dentry pages. * In addition, it controls garbage collection. */ -void f2fs_balance_fs(struct f2fs_sb_info *sbi) +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_CHECKPOINT)) + f2fs_stop_checkpoint(sbi, false); +#endif + + if (!need) + return; + + /* balance_fs_bg is able to be pending */ + if (excess_cached_nats(sbi)) + f2fs_balance_fs_bg(sbi); + /* * We should do GC or end up with checkpoint, if there are so many dirty * dir/node pages without enough free segments. */ - if (has_not_enough_free_secs(sbi, 0)) { + if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); f2fs_gc(sbi, false); } @@ -304,14 +376,26 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) try_to_free_nats(sbi, NAT_ENTRY_PER_BLOCK); if (!available_free_memory(sbi, FREE_NIDS)) - try_to_free_nids(sbi, NAT_ENTRY_PER_BLOCK * FREE_NID_PAGES); + try_to_free_nids(sbi, MAX_FREE_NIDS); + else + build_free_nids(sbi); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || - excess_prefree_segs(sbi) || !available_free_memory(sbi, INO_ENTRIES) || - jiffies > sbi->cp_expires) + excess_prefree_segs(sbi) || + excess_dirty_nats(sbi) || + (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + if (test_opt(sbi, DATA_FLUSH)) { + struct blk_plug plug; + + blk_start_plug(&plug); + sync_dirty_inodes(sbi, FILE_INODE); + blk_finish_plug(&plug); + } f2fs_sync_fs(sbi->sb, true); + stat_inc_bg_cp_count(sbi->stat_info); + } } static int issue_flush_thread(void *data) @@ -361,24 +445,28 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE)) { + if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { struct bio *bio = f2fs_bio_alloc(0); int ret; + atomic_inc(&fcc->submit_flush); bio->bi_bdev = sbi->sb->s_bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); + atomic_dec(&fcc->submit_flush); bio_put(bio); return ret; } init_completion(&cmd.wait); + atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); return cmd.ret; } @@ -392,6 +480,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; + atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; @@ -513,28 +602,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } -bool discard_next_dnode(struct f2fs_sb_info *sbi, block_t blkaddr) -{ - int err = -ENOTSUPP; - - if (test_opt(sbi, DISCARD)) { - struct seg_entry *se = get_seg_entry(sbi, - GET_SEGNO(sbi, blkaddr)); - unsigned int offset = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); - - if (f2fs_test_bit(offset, se->discard_map)) - return false; - - err = f2fs_issue_discard(sbi, blkaddr, 1); - } - - if (err) { - update_meta_page(sbi, NULL, blkaddr); - return true; - } - return false; -} - static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) @@ -573,7 +640,7 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) bool force = (cpc->reason == CP_DISCARD); int i; - if (se->valid_blocks == max_blocks) + if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) return; if (!force) { @@ -593,6 +660,10 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) break; end = __find_rev_next_zero_bit(dmap, max_blocks, start + 1); + if (force && start && end != max_blocks + && (end - start) < cpc->trim_minlen) + continue; + __add_discard_entry(sbi, cpc, se, start, end); } } @@ -630,6 +701,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; + unsigned int secno, start_segno; + bool force = (cpc->reason == CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -646,17 +719,31 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (!test_opt(sbi, DISCARD)) + if (force || !test_opt(sbi, DISCARD)) continue; - f2fs_issue_discard(sbi, START_BLOCK(sbi, start), + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { + f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); + continue; + } +next: + secno = GET_SECNO(sbi, start); + start_segno = secno * sbi->segs_per_sec; + if (!IS_CURSEC(sbi, secno) && + !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), + sbi->segs_per_sec << sbi->log_blocks_per_seg); + + start = start_segno + sbi->segs_per_sec; + if (start < end) + goto next; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (cpc->reason == CP_DISCARD && entry->len < cpc->trim_minlen) + if (force && entry->len < cpc->trim_minlen) goto skip; f2fs_issue_discard(sbi, entry->blkaddr, entry->len); cpc->trimmed += entry->len; @@ -711,12 +798,14 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) if (del > 0) { if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (!f2fs_test_and_set_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) f2fs_bug_on(sbi, 1); - if (f2fs_test_and_clear_bit(offset, se->discard_map)) + if (f2fs_discard_en(sbi) && + f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; } if (!f2fs_test_bit(offset, se->ckpt_valid_map)) @@ -817,12 +906,12 @@ int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) } } - sum_in_page = (PAGE_CACHE_SIZE - 2 * SUM_JOURNAL_SIZE - + sum_in_page = (PAGE_SIZE - 2 * SUM_JOURNAL_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE; if (valid_sum_count <= sum_in_page) return 1; else if ((valid_sum_count - sum_in_page) <= - (PAGE_CACHE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) + (PAGE_SIZE - SUM_FOOTER_SIZE) / SUMMARY_SIZE) return 2; return 3; } @@ -841,9 +930,9 @@ void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) void *dst = page_address(page); if (src) - memcpy(dst, src, PAGE_CACHE_SIZE); + memcpy(dst, src, PAGE_SIZE); else - memset(dst, 0, PAGE_CACHE_SIZE); + memset(dst, 0, PAGE_SIZE); set_page_dirty(page); f2fs_put_page(page, 1); } @@ -854,6 +943,31 @@ static void write_sum_page(struct f2fs_sb_info *sbi, update_meta_page(sbi, (void *)sum_blk, blk_addr); } +static void write_current_sum_page(struct f2fs_sb_info *sbi, + int type, block_t blk_addr) +{ + struct curseg_info *curseg = CURSEG_I(sbi, type); + struct page *page = grab_meta_page(sbi, blk_addr); + struct f2fs_summary_block *src = curseg->sum_blk; + struct f2fs_summary_block *dst; + + dst = (struct f2fs_summary_block *)page_address(page); + + mutex_lock(&curseg->curseg_mutex); + + down_read(&curseg->journal_rwsem); + memcpy(&dst->journal, curseg->journal, SUM_JOURNAL_SIZE); + up_read(&curseg->journal_rwsem); + + memcpy(dst->entries, src->entries, SUM_ENTRY_SIZE); + memcpy(&dst->footer, &src->footer, SUM_FOOTER_SIZE); + + mutex_unlock(&curseg->curseg_mutex); + + set_page_dirty(page); + f2fs_put_page(page, 1); +} + static int is_next_segment_free(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -886,9 +1000,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - MAIN_SEGS(sbi), *newseg + 1); - if (segno - *newseg < sbi->segs_per_sec - - (*newseg % sbi->segs_per_sec)) + (hint + 1) * sbi->segs_per_sec, *newseg + 1); + if (segno < (hint + 1) * sbi->segs_per_sec) goto got_it; } find_other_zone: @@ -1071,7 +1184,7 @@ static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0)) + if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) return v_ops->get_victim(sbi, &(curseg)->next_segno, BG_GC, type, SSR); @@ -1120,6 +1233,9 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + if (test_opt(sbi, LFS)) + return; + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) __allocate_new_segments(sbi, i); } @@ -1134,6 +1250,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) __u64 end = start + F2FS_BYTES_TO_BLK(range->len) - 1; unsigned int start_segno, end_segno; struct cp_control cpc; + int err = 0; if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize) return -EINVAL; @@ -1142,6 +1259,12 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) if (end <= MAIN_BLKADDR(sbi)) goto out; + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK)) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Found FS corruption, run fsck to fix."); + goto out; + } + /* start/end segment number in main_area */ start_segno = (start <= MAIN_BLKADDR(sbi)) ? 0 : GET_SEGNO(sbi, start); end_segno = (end >= MAX_BLKADDR(sbi)) ? MAIN_SEGS(sbi) - 1 : @@ -1164,12 +1287,16 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) sbi->segs_per_sec) - 1, end_segno); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); + if (err) + break; + + schedule(); } out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); - return 0; + return err; } static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) @@ -1256,7 +1383,7 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, /* direct_io'ed data is aligned to the segment for better performance */ if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0)) + !has_not_enough_free_secs(sbi, 0, 0)) __allocate_new_segments(sbi, type); *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); @@ -1292,11 +1419,17 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); - allocate_data_block(fio->sbi, fio->page, fio->blk_addr, - &fio->blk_addr, sum, type); + if (fio->type == NODE || fio->type == DATA) + mutex_lock(&fio->sbi->wio_mutex[fio->type]); + + allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, + &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ f2fs_submit_page_mbio(fio); + + if (fio->type == NODE || fio->type == DATA) + mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) @@ -1305,7 +1438,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .sbi = sbi, .type = META, .rw = WRITE_SYNC | REQ_META | REQ_PRIO, - .blk_addr = page->index, + .old_blkaddr = page->index, + .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, }; @@ -1335,19 +1469,19 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); - dn->data_blkaddr = fio->blk_addr; + f2fs_update_data_blkaddr(dn, fio->new_blkaddr); } void rewrite_data_page(struct f2fs_io_info *fio) { + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); f2fs_submit_page_mbio(fio); } -static void __f2fs_replace_block(struct f2fs_sb_info *sbi, - struct f2fs_summary *sum, +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, block_t old_blkaddr, block_t new_blkaddr, - bool recover_curseg) + bool recover_curseg, bool recover_newaddr) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg; @@ -1390,7 +1524,7 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); __add_sum_entry(sbi, type, sum); - if (!recover_curseg) + if (!recover_curseg || recover_newaddr) update_sit_entry(sbi, new_blkaddr, 1); if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO) update_sit_entry(sbi, old_blkaddr, -1); @@ -1414,66 +1548,30 @@ static void __f2fs_replace_block(struct f2fs_sb_info *sbi, void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, block_t old_addr, block_t new_addr, - unsigned char version, bool recover_curseg) + unsigned char version, bool recover_curseg, + bool recover_newaddr) { struct f2fs_summary sum; set_summary(&sum, dn->nid, dn->ofs_in_node, version); - __f2fs_replace_block(sbi, &sum, old_addr, new_addr, recover_curseg); + __f2fs_replace_block(sbi, &sum, old_addr, new_addr, + recover_curseg, recover_newaddr); - dn->data_blkaddr = new_addr; - set_data_blkaddr(dn); - f2fs_update_extent_cache(dn); -} - -static inline bool is_merged_page(struct f2fs_sb_info *sbi, - struct page *page, enum page_type type) -{ - enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - struct bio_vec *bvec; - struct page *target; - int i; - - down_read(&io->io_rwsem); - if (!io->bio) { - up_read(&io->io_rwsem); - return false; - } - - bio_for_each_segment_all(bvec, io->bio, i) { - - if (bvec->bv_page->mapping) { - target = bvec->bv_page; - } else { - struct f2fs_crypto_ctx *ctx; - - /* encrypted page */ - ctx = (struct f2fs_crypto_ctx *)page_private( - bvec->bv_page); - target = ctx->w.control_page; - } - - if (page == target) { - up_read(&io->io_rwsem); - return true; - } - } - - up_read(&io->io_rwsem); - return false; + f2fs_update_data_blkaddr(dn, new_addr); } void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type) + enum page_type type, bool ordered) { if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - if (is_merged_page(sbi, page, type)) - f2fs_submit_merged_bio(sbi, type, WRITE); - wait_on_page_writeback(page); + f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + if (ordered) + wait_on_page_writeback(page); + else + wait_for_stable_page(page); } } @@ -1482,14 +1580,12 @@ void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, { struct page *cpage; - if (blkaddr == NEW_ADDR) + if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) return; - f2fs_bug_on(sbi, blkaddr == NULL_ADDR); - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA); + f2fs_wait_on_page_writeback(cpage, DATA, true); f2fs_put_page(cpage, 1); } } @@ -1510,12 +1606,11 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(&seg_i->sum_blk->n_nats, kaddr, SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr, SUM_JOURNAL_SIZE); /* Step 2: restore sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(&seg_i->sum_blk->n_sits, kaddr + SUM_JOURNAL_SIZE, - SUM_JOURNAL_SIZE); + memcpy(seg_i->journal, kaddr + SUM_JOURNAL_SIZE, SUM_JOURNAL_SIZE); offset = 2 * SUM_JOURNAL_SIZE; /* Step 3: restore summary entries */ @@ -1539,7 +1634,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; - if (offset + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (offset + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1611,7 +1706,14 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) /* set uncompleted segment to curseg */ curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); - memcpy(curseg->sum_blk, sum, PAGE_CACHE_SIZE); + + /* update journal info */ + down_write(&curseg->journal_rwsem); + memcpy(curseg->journal, &sum->journal, SUM_JOURNAL_SIZE); + up_write(&curseg->journal_rwsem); + + memcpy(curseg->sum_blk->entries, sum->entries, SUM_ENTRY_SIZE); + memcpy(&curseg->sum_blk->footer, &sum->footer, SUM_FOOTER_SIZE); curseg->next_segno = segno; reset_curseg(sbi, type, 0); curseg->alloc_type = ckpt->alloc_type[type]; @@ -1623,14 +1725,10 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { - struct f2fs_summary_block *s_sits = - CURSEG_I(sbi, CURSEG_COLD_DATA)->sum_blk; - struct f2fs_summary_block *s_nats = - CURSEG_I(sbi, CURSEG_HOT_DATA)->sum_blk; int type = CURSEG_HOT_DATA; int err; - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) { + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) { int npages = npages_for_summary_flush(sbi, true); if (npages >= 2) @@ -1653,11 +1751,6 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } - /* sanity check for summary blocks */ - if (nats_in_cursum(s_nats) > NAT_JOURNAL_ENTRIES || - sits_in_cursum(s_sits) > SIT_JOURNAL_ENTRIES) - return -EINVAL; - return 0; } @@ -1675,13 +1768,12 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 1: write nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); - memcpy(kaddr, &seg_i->sum_blk->n_nats, SUM_JOURNAL_SIZE); + memcpy(kaddr, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 2: write sit cache */ seg_i = CURSEG_I(sbi, CURSEG_COLD_DATA); - memcpy(kaddr + written_size, &seg_i->sum_blk->n_sits, - SUM_JOURNAL_SIZE); + memcpy(kaddr + written_size, seg_i->journal, SUM_JOURNAL_SIZE); written_size += SUM_JOURNAL_SIZE; /* Step 3: write summary entries */ @@ -1703,7 +1795,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) *summary = seg_i->sum_blk->entries[j]; written_size += SUMMARY_SIZE; - if (written_size + SUMMARY_SIZE <= PAGE_CACHE_SIZE - + if (written_size + SUMMARY_SIZE <= PAGE_SIZE - SUM_FOOTER_SIZE) continue; @@ -1727,17 +1819,13 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, else end = type + NR_CURSEG_NODE_TYPE; - for (i = type; i < end; i++) { - struct curseg_info *sum = CURSEG_I(sbi, i); - mutex_lock(&sum->curseg_mutex); - write_sum_page(sbi, sum->sum_blk, blkaddr + (i - type)); - mutex_unlock(&sum->curseg_mutex); - } + for (i = type; i < end; i++) + write_current_sum_page(sbi, i, blkaddr + (i - type)); } void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk) { - if (is_set_ckpt_flags(F2FS_CKPT(sbi), CP_COMPACT_SUM_FLAG)) + if (is_set_ckpt_flags(sbi, CP_COMPACT_SUM_FLAG)) write_compacted_summaries(sbi, start_blk); else write_normal_summaries(sbi, start_blk, CURSEG_HOT_DATA); @@ -1748,24 +1836,24 @@ void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk) write_normal_summaries(sbi, start_blk, CURSEG_HOT_NODE); } -int lookup_journal_in_cursum(struct f2fs_summary_block *sum, int type, +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, unsigned int val, int alloc) { int i; if (type == NAT_JOURNAL) { - for (i = 0; i < nats_in_cursum(sum); i++) { - if (le32_to_cpu(nid_in_journal(sum, i)) == val) + for (i = 0; i < nats_in_cursum(journal); i++) { + if (le32_to_cpu(nid_in_journal(journal, i)) == val) return i; } - if (alloc && nats_in_cursum(sum) < NAT_JOURNAL_ENTRIES) - return update_nats_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, NAT_JOURNAL)) + return update_nats_in_cursum(journal, 1); } else if (type == SIT_JOURNAL) { - for (i = 0; i < sits_in_cursum(sum); i++) - if (le32_to_cpu(segno_in_journal(sum, i)) == val) + for (i = 0; i < sits_in_cursum(journal); i++) + if (le32_to_cpu(segno_in_journal(journal, i)) == val) return i; - if (alloc && sits_in_cursum(sum) < SIT_JOURNAL_ENTRIES) - return update_sits_in_cursum(sum, 1); + if (alloc && __has_cursum_space(journal, 1, SIT_JOURNAL)) + return update_sits_in_cursum(journal, 1); } return -1; } @@ -1794,7 +1882,7 @@ static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, src_addr = page_address(src_page); dst_addr = page_address(dst_page); - memcpy(dst_addr, src_addr, PAGE_CACHE_SIZE); + memcpy(dst_addr, src_addr, PAGE_SIZE); set_page_dirty(dst_page); f2fs_put_page(src_page, 1); @@ -1869,20 +1957,22 @@ static void add_sits_in_set(struct f2fs_sb_info *sbi) static void remove_sits_in_journal(struct f2fs_sb_info *sbi) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; int i; - for (i = sits_in_cursum(sum) - 1; i >= 0; i--) { + down_write(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { unsigned int segno; bool dirtied; - segno = le32_to_cpu(segno_in_journal(sum, i)); + segno = le32_to_cpu(segno_in_journal(journal, i)); dirtied = __mark_sit_entry_dirty(sbi, segno); if (!dirtied) add_sit_entry(segno, &SM_I(sbi)->sit_entry_set); } - update_sits_in_cursum(sum, -sits_in_cursum(sum)); + update_sits_in_cursum(journal, -i); + up_write(&curseg->journal_rwsem); } /* @@ -1894,13 +1984,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct sit_info *sit_i = SIT_I(sbi); unsigned long *bitmap = sit_i->dirty_sentries_bitmap; struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; struct sit_entry_set *ses, *tmp; struct list_head *head = &SM_I(sbi)->sit_entry_set; bool to_journal = true; struct seg_entry *se; - mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); if (!sit_i->dirty_sentries) @@ -1917,7 +2006,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and add and account * them in sit entry set. */ - if (!__has_cursum_space(sum, sit_i->dirty_sentries, SIT_JOURNAL)) + if (!__has_cursum_space(journal, sit_i->dirty_sentries, SIT_JOURNAL)) remove_sits_in_journal(sbi); /* @@ -1934,10 +2023,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) unsigned int segno = start_segno; if (to_journal && - !__has_cursum_space(sum, ses->entry_cnt, SIT_JOURNAL)) + !__has_cursum_space(journal, ses->entry_cnt, SIT_JOURNAL)) to_journal = false; - if (!to_journal) { + if (to_journal) { + down_write(&curseg->journal_rwsem); + } else { page = get_next_sit_page(sbi, start_segno); raw_sit = page_address(page); } @@ -1955,13 +2046,13 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) } if (to_journal) { - offset = lookup_journal_in_cursum(sum, + offset = lookup_journal_in_cursum(journal, SIT_JOURNAL, segno, 1); f2fs_bug_on(sbi, offset < 0); - segno_in_journal(sum, offset) = + segno_in_journal(journal, offset) = cpu_to_le32(segno); seg_info_to_raw_sit(se, - &sit_in_journal(sum, offset)); + &sit_in_journal(journal, offset)); } else { sit_offset = SIT_ENTRY_OFFSET(sit_i, segno); seg_info_to_raw_sit(se, @@ -1973,7 +2064,9 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) ses->entry_cnt--; } - if (!to_journal) + if (to_journal) + up_write(&curseg->journal_rwsem); + else f2fs_put_page(page, 1); f2fs_bug_on(sbi, ses->entry_cnt); @@ -1988,7 +2081,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) add_discard_addrs(sbi, cpc); } mutex_unlock(&sit_i->sentry_lock); - mutex_unlock(&curseg->curseg_mutex); set_prefree_as_free_segments(sbi); } @@ -2024,12 +2116,16 @@ static int build_sit_info(struct f2fs_sb_info *sbi) = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); sit_i->sentries[start].ckpt_valid_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); - sit_i->sentries[start].discard_map - = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); if (!sit_i->sentries[start].cur_valid_map || - !sit_i->sentries[start].ckpt_valid_map || - !sit_i->sentries[start].discard_map) + !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; + + if (f2fs_discard_en(sbi)) { + sit_i->sentries[start].discard_map + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].discard_map) + return -ENOMEM; + } } sit_i->tmp_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2117,9 +2213,14 @@ static int build_curseg(struct f2fs_sb_info *sbi) for (i = 0; i < NR_CURSEG_TYPE; i++) { mutex_init(&array[i].curseg_mutex); - array[i].sum_blk = kzalloc(PAGE_CACHE_SIZE, GFP_KERNEL); + array[i].sum_blk = kzalloc(PAGE_SIZE, GFP_KERNEL); if (!array[i].sum_blk) return -ENOMEM; + init_rwsem(&array[i].journal_rwsem); + array[i].journal = kzalloc(sizeof(struct f2fs_journal), + GFP_KERNEL); + if (!array[i].journal) + return -ENOMEM; array[i].segno = NULL_SEGNO; array[i].next_blkoff = 0; } @@ -2130,11 +2231,13 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_COLD_DATA); - struct f2fs_summary_block *sum = curseg->sum_blk; + struct f2fs_journal *journal = curseg->journal; + struct seg_entry *se; + struct f2fs_sit_entry sit; int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi); + int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); @@ -2143,41 +2246,58 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) end = (start_blk + readed) * sit_i->sents_per_block; for (; start < end && start < MAIN_SEGS(sbi); start++) { - struct seg_entry *se = &sit_i->sentries[start]; struct f2fs_sit_block *sit_blk; - struct f2fs_sit_entry sit; struct page *page; - mutex_lock(&curseg->curseg_mutex); - for (i = 0; i < sits_in_cursum(sum); i++) { - if (le32_to_cpu(segno_in_journal(sum, i)) - == start) { - sit = sit_in_journal(sum, i); - mutex_unlock(&curseg->curseg_mutex); - goto got_it; - } - } - mutex_unlock(&curseg->curseg_mutex); - + se = &sit_i->sentries[start]; page = get_current_sit_page(sbi, start); sit_blk = (struct f2fs_sit_block *)page_address(page); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; f2fs_put_page(page, 1); -got_it: + check_block_count(sbi, start, &sit); seg_info_from_raw_sit(se, &sit); /* build discard map only one time */ - memcpy(se->discard_map, se->cur_valid_map, SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - se->valid_blocks; - - if (sbi->segs_per_sec > 1) { - struct sec_entry *e = get_sec_entry(sbi, start); - e->valid_blocks += se->valid_blocks; + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += sbi->blocks_per_seg - + se->valid_blocks; } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks; } start_blk += readed; } while (start_blk < sit_blk_cnt); + + down_read(&curseg->journal_rwsem); + for (i = 0; i < sits_in_cursum(journal); i++) { + unsigned int old_valid_blocks; + + start = le32_to_cpu(segno_in_journal(journal, i)); + se = &sit_i->sentries[start]; + sit = sit_in_journal(journal, i); + + old_valid_blocks = se->valid_blocks; + + check_block_count(sbi, start, &sit); + seg_info_from_raw_sit(se, &sit); + + if (f2fs_discard_en(sbi)) { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } + + if (sbi->segs_per_sec > 1) + get_sec_entry(sbi, start)->valid_blocks += + se->valid_blocks - old_valid_blocks; + } + up_read(&curseg->journal_rwsem); } static void init_free_segmap(struct f2fs_sb_info *sbi) @@ -2310,7 +2430,11 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ssa_blkaddr = le32_to_cpu(raw_super->ssa_blkaddr); sm_info->rec_prefree_segments = sm_info->main_segments * DEF_RECLAIM_PREFREE_SEGMENTS / 100; - sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; + if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS) + sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS; + + if (!test_opt(sbi, LFS)) + sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; @@ -2392,8 +2516,10 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) if (!array) return; SM_I(sbi)->curseg_array = NULL; - for (i = 0; i < NR_CURSEG_TYPE; i++) + for (i = 0; i < NR_CURSEG_TYPE; i++) { kfree(array[i].sum_blk); + kfree(array[i].journal); + } kfree(array); } @@ -2459,7 +2585,7 @@ int __init create_segment_manager_caches(void) sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destory_discard_entry; + goto destroy_discard_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2469,7 +2595,7 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destory_discard_entry: +destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: return -ENOMEM; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index ee44d346ea44..fecb856ad874 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -16,6 +16,7 @@ #define NULL_SECNO ((unsigned int)(~0)) #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ +#define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) @@ -158,16 +159,17 @@ struct victim_sel_policy { }; struct seg_entry { - unsigned short valid_blocks; /* # of valid blocks */ + unsigned int type:6; /* segment type like CURSEG_XXX_TYPE */ + unsigned int valid_blocks:10; /* # of valid blocks */ + unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ + unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. */ - unsigned short ckpt_valid_blocks; - unsigned char *ckpt_valid_map; + unsigned char *ckpt_valid_map; /* validity bitmap of blocks last cp */ unsigned char *discard_map; - unsigned char type; /* segment type like CURSEG_XXX_TYPE */ unsigned long long mtime; /* modification time of the segment */ }; @@ -183,7 +185,7 @@ struct segment_allocation { * this value is set in page as a private data which indicate that * the page is atomically written, and it is in inmem_pages list. */ -#define ATOMIC_WRITTEN_PAGE 0x0000ffff +#define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) @@ -191,6 +193,7 @@ struct segment_allocation { struct inmem_pages { struct list_head list; struct page *page; + block_t old_addr; /* for revoking when fail to commit */ }; struct sit_info { @@ -257,6 +260,8 @@ struct victim_selection { struct curseg_info { struct mutex curseg_mutex; /* lock for consistency */ struct f2fs_summary_block *sum_blk; /* cached summary block */ + struct rw_semaphore journal_rwsem; /* protect journal area */ + struct f2fs_journal *journal; /* cached journal info */ unsigned char alloc_type; /* current allocation type */ unsigned int segno; /* current segment number */ unsigned short next_blkoff; /* next block offset to write */ @@ -466,20 +471,27 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + + if (test_opt(sbi, LFS)) + return false; + return free_sections(sbi) <= (node_secs + 2 * dent_secs + reserved_sections(sbi) + 1); } -static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, int freed) +static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, + int freed, int needed) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; - return (free_sections(sbi) + freed) <= (node_secs + 2 * dent_secs + - reserved_sections(sbi)); + return (free_sections(sbi) + freed) <= + (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) @@ -527,6 +539,9 @@ static inline bool need_inplace_update(struct inode *inode) if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) return false; + if (test_opt(sbi, LFS)) + return false; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -540,7 +555,7 @@ static inline bool need_inplace_update(struct inode *inode) /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && - is_inode_flag_set(F2FS_I(inode), FI_NEED_IPU)) + is_inode_flag_set(inode, FI_NEED_IPU)) return true; return false; @@ -573,8 +588,8 @@ static inline void check_seg_range(struct f2fs_sb_info *sbi, unsigned int segno) static inline void verify_block_addr(struct f2fs_sb_info *sbi, block_t blk_addr) { - f2fs_bug_on(sbi, blk_addr < SEG0_BLKADDR(sbi) - || blk_addr >= MAX_BLKADDR(sbi)); + BUG_ON(blk_addr < SEG0_BLKADDR(sbi) + || blk_addr >= MAX_BLKADDR(sbi)); } /* @@ -702,9 +717,9 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) if (type == DATA) return sbi->blocks_per_seg; else if (type == NODE) - return 3 * sbi->blocks_per_seg; + return 8 * sbi->blocks_per_seg; else if (type == META) - return MAX_BIO_BLOCKS(sbi); + return 8 * MAX_BIO_BLOCKS(sbi); else return 0; } @@ -722,10 +737,8 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, nr_to_write = wbc->nr_to_write; - if (type == DATA) - desired = 4096; - else if (type == NODE) - desired = 3 * max_hw_blocks(sbi); + if (type == NODE) + desired = 2 * max_hw_blocks(sbi); else desired = MAX_BIO_BLOCKS(sbi); diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index da0d8e0b55a5..46c915425923 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -13,6 +13,7 @@ #include #include "f2fs.h" +#include "node.h" static LIST_HEAD(f2fs_list); static DEFINE_SPINLOCK(f2fs_list_lock); @@ -25,14 +26,15 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > NAT_ENTRY_PER_BLOCK) - return NM_I(sbi)->fcnt - NAT_ENTRY_PER_BLOCK; + if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) + return NM_I(sbi)->fcnt - MAX_FREE_NIDS; return 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) { - return sbi->total_ext_tree + atomic_read(&sbi->total_ext_node); + return atomic_read(&sbi->total_zombie_tree) + + atomic_read(&sbi->total_ext_node); } unsigned long f2fs_shrink_count(struct shrinker *shrink, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4f666368aa85..fd249cc9b96e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -39,6 +39,35 @@ static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; static struct kset *f2fs_kset; +#ifdef CONFIG_F2FS_FAULT_INJECTION + +char *fault_name[FAULT_MAX] = { + [FAULT_KMALLOC] = "kmalloc", + [FAULT_PAGE_ALLOC] = "page alloc", + [FAULT_ALLOC_NID] = "alloc nid", + [FAULT_ORPHAN] = "orphan", + [FAULT_BLOCK] = "no more block", + [FAULT_DIR_DEPTH] = "too big dir depth", + [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_IO] = "IO error", + [FAULT_CHECKPOINT] = "checkpoint error", +}; + +static void f2fs_build_fault_attr(struct f2fs_sb_info *sbi, + unsigned int rate) +{ + struct f2fs_fault_info *ffi = &sbi->fault_info; + + if (rate) { + atomic_set(&ffi->inject_ops, 0); + ffi->inject_rate = rate; + ffi->inject_type = (1 << FAULT_MAX) - 1; + } else { + memset(ffi, 0, sizeof(struct f2fs_fault_info)); + } +} +#endif + /* f2fs-wide shrinker description */ static struct shrinker f2fs_shrinker_info = { .scan_objects = f2fs_shrink_scan, @@ -51,6 +80,7 @@ enum { Opt_disable_roll_forward, Opt_norecovery, Opt_discard, + Opt_nodiscard, Opt_noheap, Opt_user_xattr, Opt_nouser_xattr, @@ -61,12 +91,19 @@ enum { Opt_inline_xattr, Opt_inline_data, Opt_inline_dentry, + Opt_noinline_dentry, Opt_flush_merge, + Opt_noflush_merge, Opt_nobarrier, Opt_fastboot, Opt_extent_cache, Opt_noextent_cache, Opt_noinline_data, + Opt_data_flush, + Opt_mode, + Opt_fault_injection, + Opt_lazytime, + Opt_nolazytime, Opt_err, }; @@ -75,6 +112,7 @@ static match_table_t f2fs_tokens = { {Opt_disable_roll_forward, "disable_roll_forward"}, {Opt_norecovery, "norecovery"}, {Opt_discard, "discard"}, + {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, @@ -85,12 +123,19 @@ static match_table_t f2fs_tokens = { {Opt_inline_xattr, "inline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, + {Opt_noinline_dentry, "noinline_dentry"}, {Opt_flush_merge, "flush_merge"}, + {Opt_noflush_merge, "noflush_merge"}, {Opt_nobarrier, "nobarrier"}, {Opt_fastboot, "fastboot"}, {Opt_extent_cache, "extent_cache"}, {Opt_noextent_cache, "noextent_cache"}, {Opt_noinline_data, "noinline_data"}, + {Opt_data_flush, "data_flush"}, + {Opt_mode, "mode=%s"}, + {Opt_fault_injection, "fault_injection=%u"}, + {Opt_lazytime, "lazytime"}, + {Opt_nolazytime, "nolazytime"}, {Opt_err, NULL}, }; @@ -100,6 +145,10 @@ enum { SM_INFO, /* struct f2fs_sm_info */ NM_INFO, /* struct f2fs_nm_info */ F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif }; struct f2fs_attr { @@ -121,9 +170,27 @@ static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) return (unsigned char *)NM_I(sbi); else if (struct_type == F2FS_SBI) return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif return NULL; } +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -157,6 +224,10 @@ static ssize_t f2fs_sbi_store(struct f2fs_attr *a, ret = kstrtoul(skip_spaces(buf), 0, &t); if (ret < 0) return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif *ui = t; return count; } @@ -202,6 +273,9 @@ static struct f2fs_attr f2fs_attr_##_name = { \ f2fs_sbi_show, f2fs_sbi_store, \ offsetof(struct struct_name, elname)) +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); @@ -214,9 +288,16 @@ F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, cp_interval); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -234,7 +315,14 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(dir_level), ATTR_LIST(ram_thresh), ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), NULL, }; @@ -330,6 +418,8 @@ static int parse_options(struct super_block *sb, char *options) "the device does not support discard"); } break; + case Opt_nodiscard: + clear_opt(sbi, DISCARD); case Opt_noheap: set_opt(sbi, NOHEAP); break; @@ -388,9 +478,15 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_dentry: set_opt(sbi, INLINE_DENTRY); break; + case Opt_noinline_dentry: + clear_opt(sbi, INLINE_DENTRY); + break; case Opt_flush_merge: set_opt(sbi, FLUSH_MERGE); break; + case Opt_noflush_merge: + clear_opt(sbi, FLUSH_MERGE); + break; case Opt_nobarrier: set_opt(sbi, NOBARRIER); break; @@ -406,6 +502,42 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noinline_data: clear_opt(sbi, INLINE_DATA); break; + case Opt_data_flush: + set_opt(sbi, DATA_FLUSH); + break; + case Opt_mode: + name = match_strdup(&args[0]); + + if (!name) + return -ENOMEM; + if (strlen(name) == 8 && + !strncmp(name, "adaptive", 8)) { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } else if (strlen(name) == 3 && + !strncmp(name, "lfs", 3)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + } else { + kfree(name); + return -EINVAL; + } + kfree(name); + break; + case Opt_fault_injection: + if (args->from && match_int(args, &arg)) + return -EINVAL; +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, arg); +#else + f2fs_msg(sb, KERN_INFO, + "FAULT_INJECTION was not selected"); +#endif + break; + case Opt_lazytime: + sb->s_flags |= MS_LAZYTIME; + break; + case Opt_nolazytime: + sb->s_flags &= ~MS_LAZYTIME; + break; default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -426,26 +558,25 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); + if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { + kmem_cache_free(f2fs_inode_cachep, fi); + return NULL; + } + /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; - atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); + INIT_LIST_HEAD(&fi->dirty_list); + INIT_LIST_HEAD(&fi->gdirty_list); INIT_LIST_HEAD(&fi->inmem_pages); mutex_init(&fi->inmem_lock); - - set_inode_flag(fi, FI_NEW_INODE); - - if (test_opt(F2FS_SB(sb), INLINE_XATTR)) - set_inode_flag(fi, FI_INLINE_XATTR); + init_rwsem(&fi->dio_rwsem[READ]); + init_rwsem(&fi->dio_rwsem[WRITE]); /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - fi->i_crypt_info = NULL; -#endif return &fi->vfs_inode; } @@ -458,7 +589,7 @@ static int f2fs_drop_inode(struct inode *inode) * - f2fs_gc -> iput -> evict * - inode_wait_for_writeback(inode) */ - if (!inode_unhashed(inode) && inode->i_state & I_SYNC) { + if ((!inode_unhashed(inode) && inode->i_state & I_SYNC)) { if (!inode->i_nlink && !is_bad_inode(inode)) { /* to avoid evict_inode call simultaneously */ atomic_inc(&inode->i_count); @@ -466,32 +597,66 @@ static int f2fs_drop_inode(struct inode *inode) /* some remained atomic pages should discarded */ if (f2fs_is_atomic_file(inode)) - commit_inmem_pages(inode, true); + drop_inmem_pages(inode); /* should remain fi->extent_tree for writepage */ f2fs_destroy_extent_node(inode); sb_start_intwrite(inode->i_sb); - i_size_write(inode, 0); + f2fs_i_size_write(inode, 0); if (F2FS_HAS_BLOCKS(inode)) - f2fs_truncate(inode, true); + f2fs_truncate(inode); sb_end_intwrite(inode->i_sb); -#ifdef CONFIG_F2FS_FS_ENCRYPTION - if (F2FS_I(inode)->i_crypt_info) - f2fs_free_encryption_info(inode, - F2FS_I(inode)->i_crypt_info); -#endif + fscrypt_put_encryption_info(inode, NULL); spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } return 0; } + return generic_drop_inode(inode); } +int f2fs_inode_dirtied(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return 1; + } + + set_inode_flag(inode, FI_DIRTY_INODE); + list_add_tail(&F2FS_I(inode)->gdirty_list, + &sbi->inode_list[DIRTY_META]); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + stat_inc_dirty_inode(sbi, DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + + return 0; +} + +void f2fs_inode_synced(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + spin_lock(&sbi->inode_lock[DIRTY_META]); + if (!is_inode_flag_set(inode, FI_DIRTY_INODE)) { + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return; + } + list_del_init(&F2FS_I(inode)->gdirty_list); + clear_inode_flag(inode, FI_DIRTY_INODE); + clear_inode_flag(inode, FI_AUTO_RECOVER); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); + spin_unlock(&sbi->inode_lock[DIRTY_META]); +} + /* * f2fs_dirty_inode() is called from __mark_inode_dirty() * @@ -499,7 +664,19 @@ static int f2fs_drop_inode(struct inode *inode) */ static void f2fs_dirty_inode(struct inode *inode, int flags) { - set_inode_flag(F2FS_I(inode), FI_DIRTY_INODE); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_NODE_INO(sbi) || + inode->i_ino == F2FS_META_INO(sbi)) + return; + + if (flags == I_DIRTY_TIME) + return; + + if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) + clear_inode_flag(inode, FI_AUTO_RECOVER); + + f2fs_inode_dirtied(inode); } static void f2fs_i_callback(struct rcu_head *head) @@ -510,15 +687,27 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { + percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } +static void destroy_percpu_info(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < NR_COUNT_TYPE; i++) + percpu_counter_destroy(&sbi->nr_pages[i]); + percpu_counter_destroy(&sbi->alloc_valid_block_count); + percpu_counter_destroy(&sbi->total_valid_inode_count); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } kobject_del(&sbi->s_kobj); @@ -534,7 +723,7 @@ static void f2fs_put_super(struct super_block *sb) * clean checkpoint again. */ if (is_sbi_flag_set(sbi, SBI_IS_DIRTY) || - !is_set_ckpt_flags(F2FS_CKPT(sbi), CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { struct cp_control cpc = { .reason = CP_UMOUNT, }; @@ -548,12 +737,15 @@ static void f2fs_put_super(struct super_block *sb) * normally superblock is clean, so we need to release this. * In addition, EIO will skip do checkpoint, we need this as well. */ - release_dirty_inode(sbi); + release_ino_entry(sbi, true); release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); + /* our cp_error case, we can wait for any writeback page */ + f2fs_flush_merged_bios(sbi); + iput(sbi->node_inode); iput(sbi->meta_inode); @@ -566,13 +758,18 @@ static void f2fs_put_super(struct super_block *sb) wait_for_completion(&sbi->s_kobj_unregister); sb->s_fs_info = NULL; - brelse(sbi->raw_super_buf); + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); + kfree(sbi->raw_super); + + destroy_percpu_info(sbi); kfree(sbi); } int f2fs_sync_fs(struct super_block *sb, int sync) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int err = 0; trace_f2fs_sync_fs(sb, sync); @@ -582,14 +779,12 @@ int f2fs_sync_fs(struct super_block *sb, int sync) cpc.reason = __get_cp_reason(sbi); mutex_lock(&sbi->gc_mutex); - write_checkpoint(sbi, &cpc); + err = write_checkpoint(sbi, &cpc); mutex_unlock(&sbi->gc_mutex); - } else { - f2fs_balance_fs(sbi); } f2fs_trace_ios(NULL, 1); - return 0; + return err; } static int f2fs_freeze(struct super_block *sb) @@ -623,7 +818,7 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bsize = sbi->blocksize; buf->f_blocks = total_count - start_count; - buf->f_bfree = buf->f_blocks - valid_user_blocks(sbi) - ovp_count; + buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -676,6 +871,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",noinline_data"); if (test_opt(sbi, INLINE_DENTRY)) seq_puts(seq, ",inline_dentry"); + else + seq_puts(seq, ",noinline_dentry"); if (!f2fs_readonly(sbi->sb) && test_opt(sbi, FLUSH_MERGE)) seq_puts(seq, ",flush_merge"); if (test_opt(sbi, NOBARRIER)) @@ -686,6 +883,14 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",extent_cache"); else seq_puts(seq, ",noextent_cache"); + if (test_opt(sbi, DATA_FLUSH)) + seq_puts(seq, ",data_flush"); + + seq_puts(seq, ",mode="); + if (test_opt(sbi, ADAPTIVE)) + seq_puts(seq, "adaptive"); + else if (test_opt(sbi, LFS)) + seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); return 0; @@ -718,19 +923,47 @@ static int segment_info_seq_show(struct seq_file *seq, void *offset) return 0; } -static int segment_info_open_fs(struct inode *inode, struct file *file) +static int segment_bits_seq_show(struct seq_file *seq, void *offset) { - return single_open(file, segment_info_seq_show, PDE_DATA(inode)); + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, 1)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; } -static const struct file_operations f2fs_seq_segment_info_fops = { - .owner = THIS_MODULE, - .open = segment_info_open_fs, - .read = seq_read, - .llseek = seq_lseek, - .release = single_release, +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .owner = THIS_MODULE, \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ }; +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); + static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ @@ -738,7 +971,16 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, BG_GC); set_opt(sbi, INLINE_DATA); + set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + sbi->sb->s_flags |= MS_LAZYTIME; + set_opt(sbi, FLUSH_MERGE); + if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + set_opt_mode(sbi, F2FS_MOUNT_LFS); + set_opt(sbi, DISCARD); + } else { + set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); + } #ifdef CONFIG_F2FS_FS_XATTR set_opt(sbi, XATTR_USER); @@ -746,6 +988,10 @@ static void default_options(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FS_POSIX_ACL set_opt(sbi, POSIX_ACL); #endif + +#ifdef CONFIG_F2FS_FAULT_INJECTION + f2fs_build_fault_attr(sbi, 0); +#endif } static int f2fs_remount(struct super_block *sb, int *flags, char *data) @@ -756,8 +1002,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool need_restart_gc = false; bool need_stop_gc = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); - - sync_filesystem(sb); +#ifdef CONFIG_F2FS_FAULT_INJECTION + struct f2fs_fault_info ffi = sbi->fault_info; +#endif /* * Save the old mount options in case we @@ -766,6 +1013,15 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; active_logs = sbi->active_logs; + /* recover superblocks we couldn't write due to previous RO mount */ + if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { + err = f2fs_commit_super(sbi, false); + f2fs_msg(sb, KERN_INFO, + "Try to recover all the superblocks, ret: %d", err); + if (!err) + clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); + } + sbi->mount_opt.opt = 0; default_options(sbi); @@ -797,7 +1053,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & MS_RDONLY) || !test_opt(sbi, BG_GC)) { if (sbi->gc_thread) { stop_gc_thread(sbi); - f2fs_sync_fs(sb, 1); need_restart_gc = true; } } else if (!sbi->gc_thread) { @@ -807,6 +1062,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) need_stop_gc = true; } + if (*flags & MS_RDONLY) { + writeback_inodes_sb(sb, WB_REASON_SYNC); + sync_inodes_sb(sb); + + set_sbi_flag(sbi, SBI_IS_DIRTY); + set_sbi_flag(sbi, SBI_IS_CLOSE); + f2fs_sync_fs(sb, 1); + clear_sbi_flag(sbi, SBI_IS_CLOSE); + } + /* * We stop issue flush thread if FS is mounted as RO * or if flush_merge is not passed in mount option. @@ -820,8 +1085,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } skip: /* Update the POSIXACL Flag */ - sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | + sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); + return 0; restore_gc: if (need_restart_gc) { @@ -834,6 +1100,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) restore_opts: sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; +#ifdef CONFIG_F2FS_FAULT_INJECTION + sbi->fault_info = ffi; +#endif return err; } @@ -853,6 +1122,48 @@ static struct super_operations f2fs_sops = { .remount_fs = f2fs_remount, }; +#ifdef CONFIG_F2FS_FS_ENCRYPTION +static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) +{ + return f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, NULL); +} + +static int f2fs_key_prefix(struct inode *inode, u8 **key) +{ + *key = F2FS_I_SB(inode)->key_prefix; + return F2FS_I_SB(inode)->key_prefix_size; +} + +static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, + void *fs_data) +{ + return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, + F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, + ctx, len, fs_data, XATTR_CREATE); +} + +static unsigned f2fs_max_namelen(struct inode *inode) +{ + return S_ISLNK(inode->i_mode) ? + inode->i_sb->s_blocksize : F2FS_NAME_LEN; +} + +static struct fscrypt_operations f2fs_cryptops = { + .get_context = f2fs_get_context, + .key_prefix = f2fs_key_prefix, + .set_context = f2fs_set_context, + .is_encrypted = f2fs_encrypted_inode, + .empty_dir = f2fs_empty_dir, + .max_namelen = f2fs_max_namelen, +}; +#else +static struct fscrypt_operations f2fs_cryptops = { + .is_encrypted = f2fs_encrypted_inode, +}; +#endif + static struct inode *f2fs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { @@ -898,7 +1209,7 @@ static const struct export_operations f2fs_export_ops = { .get_parent = f2fs_get_parent, }; -static loff_t max_file_size(unsigned bits) +static loff_t max_file_blocks(void) { loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); loff_t leaf_count = ADDRS_PER_BLOCK; @@ -914,13 +1225,29 @@ static loff_t max_file_size(unsigned bits) leaf_count *= NIDS_PER_BLOCK; result += leaf_count; - result <<= bits; return result; } -static inline bool sanity_check_area_boundary(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int __f2fs_commit_super(struct buffer_head *bh, + struct f2fs_super_block *super) { + lock_buffer(bh); + if (super) + memcpy(bh->b_data + F2FS_SUPER_OFFSET, super, sizeof(*super)); + set_buffer_uptodate(bh); + set_buffer_dirty(bh); + unlock_buffer(bh); + + /* it's rare case, we can do fua all the time */ + return __sync_dirty_buffer(bh, WRITE_FLUSH_FUA); +} + +static inline bool sanity_check_area_boundary(struct f2fs_sb_info *sbi, + struct buffer_head *bh) +{ + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; u32 segment0_blkaddr = le32_to_cpu(raw_super->segment0_blkaddr); u32 cp_blkaddr = le32_to_cpu(raw_super->cp_blkaddr); u32 sit_blkaddr = le32_to_cpu(raw_super->sit_blkaddr); @@ -934,6 +1261,10 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, u32 segment_count_main = le32_to_cpu(raw_super->segment_count_main); u32 segment_count = le32_to_cpu(raw_super->segment_count); u32 log_blocks_per_seg = le32_to_cpu(raw_super->log_blocks_per_seg); + u64 main_end_blkaddr = main_blkaddr + + (segment_count_main << log_blocks_per_seg); + u64 seg_end_blkaddr = segment0_blkaddr + + (segment_count << log_blocks_per_seg); if (segment0_blkaddr != cp_blkaddr) { f2fs_msg(sb, KERN_INFO, @@ -978,22 +1309,47 @@ static inline bool sanity_check_area_boundary(struct super_block *sb, return true; } - if (main_blkaddr + (segment_count_main << log_blocks_per_seg) != - segment0_blkaddr + (segment_count << log_blocks_per_seg)) { + if (main_end_blkaddr > seg_end_blkaddr) { f2fs_msg(sb, KERN_INFO, - "Wrong MAIN_AREA boundary, start(%u) end(%u) blocks(%u)", + "Wrong MAIN_AREA boundary, start(%u) end(%u) block(%u)", main_blkaddr, - segment0_blkaddr + (segment_count << log_blocks_per_seg), + segment0_blkaddr + + (segment_count << log_blocks_per_seg), segment_count_main << log_blocks_per_seg); return true; - } + } else if (main_end_blkaddr < seg_end_blkaddr) { + int err = 0; + char *res; + /* fix in-memory information all the time */ + raw_super->segment_count = cpu_to_le32((main_end_blkaddr - + segment0_blkaddr) >> log_blocks_per_seg); + + if (f2fs_readonly(sb) || bdev_read_only(sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + res = "internally"; + } else { + err = __f2fs_commit_super(bh, NULL); + res = err ? "failed" : "done"; + } + f2fs_msg(sb, KERN_INFO, + "Fix alignment : %s, start(%u) end(%u) block(%u)", + res, main_blkaddr, + segment0_blkaddr + + (segment_count << log_blocks_per_seg), + segment_count_main << log_blocks_per_seg); + if (err) + return true; + } return false; } -static int sanity_check_raw_super(struct super_block *sb, - struct f2fs_super_block *raw_super) +static int sanity_check_raw_super(struct f2fs_sb_info *sbi, + struct buffer_head *bh) { + struct f2fs_super_block *raw_super = (struct f2fs_super_block *) + (bh->b_data + F2FS_SUPER_OFFSET); + struct super_block *sb = sbi->sb; unsigned int blocksize; if (F2FS_SUPER_MAGIC != le32_to_cpu(raw_super->magic)) { @@ -1004,10 +1360,10 @@ static int sanity_check_raw_super(struct super_block *sb, } /* Currently, support only 4KB page cache size */ - if (F2FS_BLKSIZE != PAGE_CACHE_SIZE) { + if (F2FS_BLKSIZE != PAGE_SIZE) { f2fs_msg(sb, KERN_INFO, "Invalid page_cache_size (%lu), supports only 4KB\n", - PAGE_CACHE_SIZE); + PAGE_SIZE); return 1; } @@ -1059,27 +1415,18 @@ static int sanity_check_raw_super(struct super_block *sb, return 1; } - if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { - f2fs_msg(sb, KERN_INFO, - "Invalid segment count (%u)", - le32_to_cpu(raw_super->segment_count)); - return 1; - } - /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ - if (sanity_check_area_boundary(sb, raw_super)) + if (sanity_check_area_boundary(sbi, bh)) return 1; return 0; } -static int sanity_check_ckpt(struct f2fs_sb_info *sbi) +int sanity_check_ckpt(struct f2fs_sb_info *sbi) { unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned int main_segs, blocks_per_seg; - int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1091,20 +1438,6 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; - main_segs = le32_to_cpu(raw_super->segment_count_main); - blocks_per_seg = sbi->blocks_per_seg; - - for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { - if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) - return 1; - } - for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { - if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || - le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) - return 1; - } - if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1115,7 +1448,6 @@ static int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1135,111 +1467,131 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->cur_victim_sec = NULL_SECNO; sbi->max_victim_search = DEF_MAX_VICTIM_SEARCH; - for (i = 0; i < NR_COUNT_TYPE; i++) - atomic_set(&sbi->nr_pages[i], 0); - sbi->dir_level = DEF_DIR_LEVEL; - sbi->cp_interval = DEF_CP_INTERVAL; + sbi->interval_time[CP_TIME] = DEF_CP_INTERVAL; + sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); + mutex_init(&sbi->wio_mutex[NODE]); + mutex_init(&sbi->wio_mutex[DATA]); + spin_lock_init(&sbi->cp_lock); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION + memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, + F2FS_KEY_DESC_PREFIX_SIZE); + sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; +#endif +} + +static int init_percpu_info(struct f2fs_sb_info *sbi) +{ + int i, err; + + for (i = 0; i < NR_COUNT_TYPE; i++) { + err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); + if (err) + return err; + } + + err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); + if (err) + return err; + + return percpu_counter_init(&sbi->total_valid_inode_count, 0, + GFP_KERNEL); } /* * Read f2fs raw super block. - * Because we have two copies of super block, so read the first one at first, - * if the first one is invalid, move to read the second one. + * Because we have two copies of super block, so read both of them + * to get the first valid one. If any one of them is broken, we pass + * them recovery flag back to the caller. */ -static int read_raw_super_block(struct super_block *sb, +static int read_raw_super_block(struct f2fs_sb_info *sbi, struct f2fs_super_block **raw_super, - struct buffer_head **raw_super_buf, - int *recovery) + int *valid_super_block, int *recovery) { - int block = 0; - struct buffer_head *buffer; + struct super_block *sb = sbi->sb; + int block; + struct buffer_head *bh; struct f2fs_super_block *super; int err = 0; -retry: - buffer = sb_bread(sb, block); - if (!buffer) { - *recovery = 1; - f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", + super = kzalloc(sizeof(struct f2fs_super_block), GFP_KERNEL); + if (!super) + return -ENOMEM; + + for (block = 0; block < 2; block++) { + bh = sb_bread(sb, block); + if (!bh) { + f2fs_msg(sb, KERN_ERR, "Unable to read %dth superblock", block + 1); - if (block == 0) { - block++; - goto retry; - } else { err = -EIO; - goto out; + continue; } - } - super = (struct f2fs_super_block *) - ((char *)(buffer)->b_data + F2FS_SUPER_OFFSET); - - /* sanity checking of raw super */ - if (sanity_check_raw_super(sb, super)) { - brelse(buffer); - *recovery = 1; - f2fs_msg(sb, KERN_ERR, - "Can't find valid F2FS filesystem in %dth superblock", - block + 1); - if (block == 0) { - block++; - goto retry; - } else { + /* sanity checking of raw super */ + if (sanity_check_raw_super(sbi, bh)) { + f2fs_msg(sb, KERN_ERR, + "Can't find valid F2FS filesystem in %dth superblock", + block + 1); err = -EINVAL; - goto out; + brelse(bh); + continue; } + + if (!*raw_super) { + memcpy(super, bh->b_data + F2FS_SUPER_OFFSET, + sizeof(*super)); + *valid_super_block = block; + *raw_super = super; + } + brelse(bh); } - if (!*raw_super) { - *raw_super_buf = buffer; - *raw_super = super; - } else { - /* already have a valid superblock */ - brelse(buffer); - } + /* Fail to read any one of the superblocks*/ + if (err < 0) + *recovery = 1; - /* check the validity of the second superblock */ - if (block == 0) { - block++; - goto retry; - } - -out: /* No valid superblock */ if (!*raw_super) - return err; + kfree(super); + else + err = 0; - return 0; + return err; } int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) { - struct buffer_head *sbh = sbi->raw_super_buf; - sector_t block = sbh->b_blocknr; + struct buffer_head *bh; int err; - /* write back-up superblock first */ - sbh->b_blocknr = block ? 0 : 1; - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); + if ((recover && f2fs_readonly(sbi->sb)) || + bdev_read_only(sbi->sb->s_bdev)) { + set_sbi_flag(sbi, SBI_NEED_SB_WRITE); + return -EROFS; + } - sbh->b_blocknr = block; + /* write back-up superblock first */ + bh = sb_getblk(sbi->sb, sbi->valid_super_block ? 0: 1); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); /* if we are in recovery path, skip writing valid superblock */ if (recover || err) - goto out; + return err; /* write current valid superblock */ - mark_buffer_dirty(sbh); - err = sync_dirty_buffer(sbh); -out: - clear_buffer_write_io_error(sbh); - set_buffer_uptodate(sbh); + bh = sb_getblk(sbi->sb, sbi->valid_super_block); + if (!bh) + return -EIO; + err = __f2fs_commit_super(bh, F2FS_RAW_SUPER(sbi)); + brelse(bh); return err; } @@ -1247,17 +1599,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; struct f2fs_super_block *raw_super; - struct buffer_head *raw_super_buf; struct inode *root; - long err; + int err; bool retry = true, need_fsck = false; char *options = NULL; - int recovery, i; + int recovery, i, valid_super_block; + struct curseg_info *seg_i; try_onemore: err = -EINVAL; raw_super = NULL; - raw_super_buf = NULL; + valid_super_block = -1; recovery = 0; /* allocate memory for f2fs-specific super block info */ @@ -1265,17 +1617,31 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (!sbi) return -ENOMEM; + sbi->sb = sb; + + /* Load the checksum driver */ + sbi->s_chksum_driver = crypto_alloc_shash("crc32", 0, 0); + if (IS_ERR(sbi->s_chksum_driver)) { + f2fs_msg(sb, KERN_ERR, "Cannot load crc32 driver."); + err = PTR_ERR(sbi->s_chksum_driver); + sbi->s_chksum_driver = NULL; + goto free_sbi; + } + /* set a block size */ if (unlikely(!sb_set_blocksize(sb, F2FS_BLKSIZE))) { f2fs_msg(sb, KERN_ERR, "unable to set blocksize"); goto free_sbi; } - err = read_raw_super_block(sb, &raw_super, &raw_super_buf, &recovery); + err = read_raw_super_block(sbi, &raw_super, &valid_super_block, + &recovery); if (err) goto free_sbi; sb->s_fs_info = sbi; + sbi->raw_super = raw_super; + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1288,11 +1654,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_options; - sb->s_maxbytes = max_file_size(le32_to_cpu(raw_super->log_blocksize)); + sbi->max_file_blocks = max_file_blocks(); + sb->s_maxbytes = sbi->max_file_blocks << + le32_to_cpu(raw_super->log_blocksize); sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); sb->s_op = &f2fs_sops; + sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; sb->s_export_op = &f2fs_export_ops; sb->s_magic = F2FS_SUPER_MAGIC; @@ -1302,11 +1671,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) memcpy(sb->s_uuid, raw_super->uuid, sizeof(raw_super->uuid)); /* init f2fs-specific super block info */ - sbi->sb = sb; - sbi->raw_super = raw_super; - sbi->raw_super_buf = raw_super_buf; + sbi->valid_super_block = valid_super_block; mutex_init(&sbi->gc_mutex); - mutex_init(&sbi->writepages); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); @@ -1327,6 +1693,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_waitqueue_head(&sbi->cp_wait); init_sb_info(sbi); + err = init_percpu_info(sbi); + if (err) + goto free_options; + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { @@ -1341,24 +1711,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_meta_inode; } - /* sanity checking of checkpoint */ - err = -EINVAL; - if (sanity_check_ckpt(sbi)) { - f2fs_msg(sb, KERN_ERR, "Invalid F2FS checkpoint"); - goto free_cp; - } - sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); - sbi->total_valid_inode_count = - le32_to_cpu(sbi->ckpt->valid_inode_count); + percpu_counter_set(&sbi->total_valid_inode_count, + le32_to_cpu(sbi->ckpt->valid_inode_count)); sbi->user_block_count = le64_to_cpu(sbi->ckpt->user_block_count); sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; - sbi->alloc_valid_block_count = 0; - INIT_LIST_HEAD(&sbi->dir_inode_list); - spin_lock_init(&sbi->dir_inode_lock); + + for (i = 0; i < NR_INODE_TYPE; i++) { + INIT_LIST_HEAD(&sbi->inode_list[i]); + spin_lock_init(&sbi->inode_lock[i]); + } init_extent_cache_info(sbi); @@ -1378,6 +1743,17 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_nm; } + /* For write statistics */ + if (sb->s_bdev->bd_part) + sbi->sectors_written_start = + (u64)part_stat_read(sb->s_bdev->bd_part, sectors[1]); + + /* Read accumulated write IO statistics if exists */ + seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); + if (__exist_node_summaries(sbi)) + sbi->kbytes_written = + le64_to_cpu(seg_i->journal->info.kbytes_written); + build_gc_manager(sbi); /* get an inode for node space */ @@ -1421,9 +1797,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (f2fs_proc_root) sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - if (sbi->s_proc) + if (sbi->s_proc) { proc_create_data("segment_info", S_IRUGO, sbi->s_proc, &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + } sbi->s_kobj.kset = f2fs_kset; init_completion(&sbi->s_kobj_unregister); @@ -1439,7 +1818,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) * previous checkpoint was not done by clean system shutdown. */ if (bdev_read_only(sb->s_bdev) && - !is_set_ckpt_flags(sbi->ckpt, CP_UMOUNT_FLAG)) { + !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; goto free_kobj; } @@ -1447,14 +1826,27 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (need_fsck) set_sbi_flag(sbi, SBI_NEED_FSCK); - err = recover_fsync_data(sbi); - if (err) { + if (!retry) + goto skip_recovery; + + err = recover_fsync_data(sbi, false); + if (err < 0) { need_fsck = true; f2fs_msg(sb, KERN_ERR, - "Cannot recover all fsync data errno=%ld", err); + "Cannot recover all fsync data errno=%d", err); + goto free_kobj; + } + } else { + err = recover_fsync_data(sbi, true); + + if (!f2fs_readonly(sb) && err > 0) { + err = -EINVAL; + f2fs_msg(sb, KERN_ERR, + "Need to recover fsync data"); goto free_kobj; } } +skip_recovery: /* recover_fsync_data() cleared this already */ clear_sbi_flag(sbi, SBI_POR_DOING); @@ -1471,20 +1863,26 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) kfree(options); /* recover broken superblock */ - if (recovery && !f2fs_readonly(sb) && !bdev_read_only(sb->s_bdev)) { - f2fs_msg(sb, KERN_INFO, "Recover invalid superblock"); - f2fs_commit_super(sbi, true); + if (recovery) { + err = f2fs_commit_super(sbi, true); + f2fs_msg(sb, KERN_INFO, + "Try to recover %dth superblock, ret: %d", + sbi->valid_super_block ? 1 : 2, err); } - sbi->cp_expires = round_jiffies_up(jiffies); - + f2fs_update_time(sbi, CP_TIME); + f2fs_update_time(sbi, REQ_TIME); return 0; free_kobj: + f2fs_sync_inode_meta(sbi); kobject_del(&sbi->s_kobj); + kobject_put(&sbi->s_kobj); + wait_for_completion(&sbi->s_kobj_unregister); free_proc: if (sbi->s_proc) { remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); remove_proc_entry(sb->s_id, f2fs_proc_root); } f2fs_destroy_stats(sbi); @@ -1492,7 +1890,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) dput(sb->s_root); sb->s_root = NULL; free_node_inode: + truncate_inode_pages_final(NODE_MAPPING(sbi)); mutex_lock(&sbi->umount_mutex); + release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); @@ -1500,16 +1900,18 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -free_cp: kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); free_options: + destroy_percpu_info(sbi); kfree(options); free_sb_buf: - brelse(raw_super_buf); + kfree(raw_super); free_sbi: + if (sbi->s_chksum_driver) + crypto_free_shash(sbi->s_chksum_driver); kfree(sbi); /* give only one another chance */ @@ -1545,8 +1947,9 @@ MODULE_ALIAS_FS("f2fs"); static int __init init_inodecache(void) { - f2fs_inode_cachep = f2fs_kmem_cache_create("f2fs_inode_cache", - sizeof(struct f2fs_inode_info)); + f2fs_inode_cachep = kmem_cache_create("f2fs_inode_cache", + sizeof(struct f2fs_inode_info), 0, + SLAB_RECLAIM_ACCOUNT, NULL); if (!f2fs_inode_cachep) return -ENOMEM; return 0; @@ -1588,25 +1991,23 @@ static int __init init_f2fs_fs(void) err = -ENOMEM; goto free_extent_cache; } - err = f2fs_init_crypto(); - if (err) - goto free_kset; - err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_crypto; + goto free_kset; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; - f2fs_create_root_stats(); + err = f2fs_create_root_stats(); + if (err) + goto free_filesystem; f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; +free_filesystem: + unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_crypto: - f2fs_exit_crypto(); free_kset: kset_unregister(f2fs_kset); free_extent_cache: @@ -1627,15 +2028,14 @@ static void __exit exit_f2fs_fs(void) { remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); - unregister_shrinker(&f2fs_shrinker_info); unregister_filesystem(&f2fs_fs_type); - f2fs_exit_crypto(); + unregister_shrinker(&f2fs_shrinker_info); + kset_unregister(f2fs_kset); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); destroy_node_manager_caches(); destroy_inodecache(); - kset_unregister(f2fs_kset); f2fs_destroy_trace_ios(); } diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 145fb659ad44..562ce0821559 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -29,7 +29,8 @@ static inline void __print_last_io(void) last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, last_io.fio.blk_addr, + last_io.fio.rw, + last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); } @@ -101,7 +102,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) last_io.pid == pid && last_io.type == __file_type(inode, pid) && last_io.fio.rw == fio->rw && - last_io.fio.blk_addr + last_io.len == fio->blk_addr) { + last_io.fio.new_blkaddr + last_io.len == + fio->new_blkaddr) { last_io.len++; return; } diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 862368a32e53..69c6bb9cf207 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - mark_inode_dirty(inode); + f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -264,18 +264,20 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } -static void *read_all_xattrs(struct inode *inode, struct page *ipage) +static int read_all_xattrs(struct inode *inode, struct page *ipage, + void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; size_t size = PAGE_SIZE, inline_size = 0; void *txattr_addr; + int err; inline_size = inline_xattr_size(inode); txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); if (!txattr_addr) - return NULL; + return -ENOMEM; /* read from inline xattr */ if (inline_size) { @@ -286,8 +288,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) inline_addr = inline_xattr_addr(ipage); } else { page = get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) + if (IS_ERR(page)) { + err = PTR_ERR(page); goto fail; + } inline_addr = inline_xattr_addr(page); } memcpy(txattr_addr, inline_addr, inline_size); @@ -301,8 +305,10 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) /* The inode already has an extended attribute block. */ xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); goto fail; + } xattr_addr = page_address(xpage); memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); @@ -316,10 +322,11 @@ static void *read_all_xattrs(struct inode *inode, struct page *ipage) header->h_magic = cpu_to_le32(F2FS_XATTR_MAGIC); header->h_refcount = cpu_to_le32(1); } - return txattr_addr; + *base_addr = txattr_addr; + return 0; fail: kzfree(txattr_addr); - return NULL; + return err; } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, @@ -345,7 +352,8 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(ipage); - f2fs_wait_on_page_writeback(ipage, NODE); + f2fs_wait_on_page_writeback(ipage, NODE, true); + set_page_dirty(ipage); } else { page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -353,7 +361,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(page); } inline_addr = inline_xattr_addr(page); - f2fs_wait_on_page_writeback(page, NODE); + f2fs_wait_on_page_writeback(page, NODE, true); } memcpy(inline_addr, txattr_addr, inline_size); f2fs_put_page(page, 1); @@ -374,7 +382,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, return PTR_ERR(xpage); } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE); + f2fs_wait_on_page_writeback(xpage, NODE, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); @@ -412,9 +420,9 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; entry = __find_xattr(base_addr, index, len, name); if (IS_XATTR_LAST_ENTRY(entry)) { @@ -448,9 +456,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; - base_addr = read_all_xattrs(inode, NULL); - if (!base_addr) - return -ENOMEM; + error = read_all_xattrs(inode, NULL, &base_addr); + if (error) + return error; list_for_each_xattr(entry, base_addr) { const struct xattr_handler *handler = @@ -481,13 +489,12 @@ static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) { - struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_xattr_entry *here, *last; void *base_addr; int found, newsize; size_t len; __u32 new_hsize; - int error = -ENOMEM; + int error = 0; if (name == NULL) return -EINVAL; @@ -503,9 +510,9 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; - base_addr = read_all_xattrs(inode, ipage); - if (!base_addr) - goto exit; + error = read_all_xattrs(inode, ipage, &base_addr); + if (error) + return error; /* find entry with wanted name. */ here = __find_xattr(base_addr, index, len, name); @@ -538,7 +545,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, free = free + ENTRY_SIZE(here); if (unlikely(free < newsize)) { - error = -ENOSPC; + error = -E2BIG; goto exit; } } @@ -566,7 +573,6 @@ static int __f2fs_setxattr(struct inode *inode, int index, * Before we come here, old entry is removed. * We just write new entry. */ - memset(last, 0, newsize); last->e_name_index = index; last->e_name_len = len; memcpy(last->e_name, name, len); @@ -580,19 +586,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (error) goto exit; - if (is_inode_flag_set(fi, FI_ACL_MODE)) { - inode->i_mode = fi->i_acl_mode; + if (is_inode_flag_set(inode, FI_ACL_MODE)) { + inode->i_mode = F2FS_I(inode)->i_acl_mode; inode->i_ctime = CURRENT_TIME; - clear_inode_flag(fi, FI_ACL_MODE); + clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - - if (ipage) - update_inode(inode, ipage); - else - update_inode_page(inode); + f2fs_mark_inode_dirty_sync(inode); + if (!error && S_ISDIR(inode->i_mode)) + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: kzfree(base_addr); return error; @@ -609,7 +613,7 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, if (ipage) return __f2fs_setxattr(inode, index, name, value, size, ipage, flags); - f2fs_balance_fs(sbi); + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); /* protect xattr_ver */ @@ -618,5 +622,6 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); + f2fs_update_time(sbi, REQ_TIME); return err; } diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index 71a7100d5492..d2fd0387a3c7 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -126,7 +126,8 @@ extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); #define f2fs_xattr_handlers NULL static inline int f2fs_setxattr(struct inode *inode, int index, - const char *name, const void *value, size_t size, int flags) + const char *name, const void *value, size_t size, + struct page *page, int flags) { return -EOPNOTSUPP; } diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 0acbb85ff9ff..51b47269e2fe 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -229,6 +229,7 @@ struct dentry_operations { #define DCACHE_MAY_FREE 0x00800000 #define DCACHE_FALLTHRU 0x01000000 /* Fall through to lower layer */ #define DCACHE_OP_SELECT_INODE 0x02000000 /* Unioned entry: dcache op selects inode */ +#define DCACHE_ENCRYPTED_WITH_KEY 0x04000000 /* dir is encrypted with a valid key */ #define DCACHE_OP_REAL 0x08000000 extern seqlock_t rename_lock; diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3d6e6ce44c5c..e46e7d10312b 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -21,7 +21,7 @@ #define F2FS_BLKSIZE 4096 /* support only 4KB block */ #define F2FS_BLKSIZE_BITS 12 /* bits for F2FS_BLKSIZE */ #define F2FS_MAX_EXTENSION 64 /* # of extension entries */ -#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) / F2FS_BLKSIZE) +#define F2FS_BLK_ALIGN(x) (((x) + F2FS_BLKSIZE - 1) >> F2FS_BLKSIZE_BITS) #define NULL_ADDR ((block_t)0) /* used as block_t addresses */ #define NEW_ADDR ((block_t)-1) /* used as block_t addresses */ @@ -51,6 +51,7 @@ #define MAX_ACTIVE_DATA_LOGS 8 #define VERSION_LEN 256 +#define MAX_VOLUME_NAME 512 /* * For superblock @@ -84,7 +85,7 @@ struct f2fs_super_block { __le32 node_ino; /* node inode number */ __le32 meta_ino; /* meta inode number */ __u8 uuid[16]; /* 128-bit uuid for volume */ - __le16 volume_name[512]; /* volume name */ + __le16 volume_name[MAX_VOLUME_NAME]; /* volume name */ __le32 extension_count; /* # of extensions below */ __u8 extension_list[F2FS_MAX_EXTENSION][8]; /* extension array */ __le32 cp_payload; @@ -99,6 +100,7 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 #define CP_ERROR_FLAG 0x00000008 @@ -169,12 +171,12 @@ struct f2fs_extent { #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ -#define ADDRS_PER_INODE(fi) addrs_per_inode(fi) +#define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ #define NIDS_PER_BLOCK 1018 /* Node IDs in an Indirect Block */ -#define ADDRS_PER_PAGE(page, fi) \ - (IS_INODE(page) ? ADDRS_PER_INODE(fi) : ADDRS_PER_BLOCK) +#define ADDRS_PER_PAGE(page, inode) \ + (IS_INODE(page) ? ADDRS_PER_INODE(inode) : ADDRS_PER_BLOCK) #define NODE_DIR1_BLOCK (DEF_ADDRS_PER_INODE + 1) #define NODE_DIR2_BLOCK (DEF_ADDRS_PER_INODE + 2) @@ -261,7 +263,7 @@ struct f2fs_node { /* * For NAT entries */ -#define NAT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -281,7 +283,7 @@ struct f2fs_nat_block { * Not allow to change this. */ #define SIT_VBLOCK_MAP_SIZE 64 -#define SIT_ENTRY_PER_BLOCK (PAGE_CACHE_SIZE / sizeof(struct f2fs_sit_entry)) +#define SIT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_sit_entry)) /* * F2FS uses 4 bytes to represent block address. As a result, supported size of @@ -350,7 +352,7 @@ struct f2fs_summary { struct summary_footer { unsigned char entry_type; /* SUM_TYPE_XXX */ - __u32 check_sum; /* summary checksum */ + __le32 check_sum; /* summary checksum */ } __packed; #define SUM_JOURNAL_SIZE (F2FS_BLKSIZE - SUM_FOOTER_SIZE -\ @@ -363,6 +365,12 @@ struct summary_footer { sizeof(struct sit_journal_entry)) #define SIT_JOURNAL_RESERVED ((SUM_JOURNAL_SIZE - 2) %\ sizeof(struct sit_journal_entry)) + +/* Reserved area should make size of f2fs_extra_info equals to + * that of nat_journal and sit_journal. + */ +#define EXTRA_INFO_RESERVED (SUM_JOURNAL_SIZE - 2 - 8) + /* * frequently updated NAT/SIT entries can be stored in the spare area in * summary blocks @@ -392,18 +400,28 @@ struct sit_journal { __u8 reserved[SIT_JOURNAL_RESERVED]; } __packed; -/* 4KB-sized summary block structure */ -struct f2fs_summary_block { - struct f2fs_summary entries[ENTRIES_IN_SUM]; +struct f2fs_extra_info { + __le64 kbytes_written; + __u8 reserved[EXTRA_INFO_RESERVED]; +} __packed; + +struct f2fs_journal { union { __le16 n_nats; __le16 n_sits; }; - /* spare area is used by NAT or SIT journals */ + /* spare area is used by NAT or SIT journals or extra info */ union { struct nat_journal nat_j; struct sit_journal sit_j; + struct f2fs_extra_info info; }; +} __packed; + +/* 4KB-sized summary block structure */ +struct f2fs_summary_block { + struct f2fs_summary entries[ENTRIES_IN_SUM]; + struct f2fs_journal journal; struct summary_footer footer; } __packed; @@ -497,4 +515,6 @@ enum { F2FS_FT_MAX }; +#define S_SHIFT 12 + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fs.h b/include/linux/fs.h index bff4ce57b77e..5b79adb9782e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -52,6 +52,8 @@ struct swap_info_struct; struct seq_file; struct workqueue_struct; struct iov_iter; +struct fscrypt_info; +struct fscrypt_operations; extern void __init inode_init(void); extern void __init inode_init_early(void); @@ -677,6 +679,9 @@ struct inode { struct hlist_head i_fsnotify_marks; #endif +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + struct fscrypt_info *i_crypt_info; +#endif void *i_private; /* fs or device private pointer */ }; @@ -1337,6 +1342,8 @@ struct super_block { #endif const struct xattr_handler **s_xattr; + const struct fscrypt_operations *s_cop; + struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h new file mode 100644 index 000000000000..76cff18bb032 --- /dev/null +++ b/include/linux/fscrypto.h @@ -0,0 +1,435 @@ +/* + * General per-file encryption definition + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#ifndef _LINUX_FSCRYPTO_H +#define _LINUX_FSCRYPTO_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +/* Encryption parameters */ +#define FS_XTS_TWEAK_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 +#define FS_MAX_KEY_SIZE 64 + +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* This is passed in from userspace into the kernel keyring */ +struct fscrypt_key { + u32 mode; + u8 raw[FS_MAX_KEY_SIZE]; + u32 size; +} __packed; + +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct key *ci_keyring_key; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_WRITE_PATH_FL 0x00000002 + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ + u8 mode; /* Encryption mode for tfm */ +}; + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER((ecr).completion), 0 } + +static inline int fscrypt_key_size(int mode) +{ + switch (mode) { + case FS_ENCRYPTION_MODE_AES_256_XTS: + return FS_AES_256_XTS_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_GCM: + return FS_AES_256_GCM_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CBC: + return FS_AES_256_CBC_KEY_SIZE; + case FS_ENCRYPTION_MODE_AES_256_CTS: + return FS_AES_256_CTS_KEY_SIZE; + default: + BUG(); + } + return 0; +} + +#define FS_FNAME_NUM_SCATTER_ENTRIES 4 +#define FS_CRYPTO_BLOCK_SIZE 16 +#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +/** + * This function is used to calculate the disk space required to + * store a filename of length l in encrypted symlink format. + */ +static inline u32 fscrypt_symlink_data_len(u32 l) +{ + if (l < FS_CRYPTO_BLOCK_SIZE) + l = FS_CRYPTO_BLOCK_SIZE; + return (l + sizeof(struct fscrypt_symlink_data) - 1); +} + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * crypto opertions for filesystems + */ +struct fscrypt_operations { + int (*get_context)(struct inode *, void *, size_t); + int (*key_prefix)(struct inode *, u8 **); + int (*prepare_context)(struct inode *); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_contents_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); +} + +static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) +{ + return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); +} + +static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) +{ + if (size == fscrypt_key_size(mode)) + return size; + return 0; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +extern const struct dentry_operations fscrypt_d_ops; +#endif + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + d_set_d_op(dentry, &fscrypt_d_ops); +#endif +} + +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +int fscrypt_initialize(void); + +extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); +extern int fscrypt_decrypt_page(struct page *); +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern void fscrypt_restore_control_page(struct page *); +extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, + unsigned int); +/* policy.c */ +extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); +extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int get_crypt_info(struct inode *); +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct fscrypt_name *); +extern void fscrypt_free_filename(struct fscrypt_name *); +extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); +#endif + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, + gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) +{ + return; +} + +static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, + struct page *p, gfp_t f) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_notsupp_decrypt_page(struct page *p) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c, + struct bio *b) +{ + return; +} + +static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b) +{ + return; +} + +static inline void fscrypt_notsupp_restore_control_page(struct page *p) +{ + return; +} + +static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, + sector_t s, unsigned int f) +{ + return -EOPNOTSUPP; +} + +/* policy.c */ +static inline int fscrypt_notsupp_process_policy(struct file *f, + const struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_get_policy(struct inode *i, + struct fscrypt_policy *p) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_has_permitted_context(struct inode *p, + struct inode *i) +{ + return 0; +} + +static inline int fscrypt_notsupp_inherit_context(struct inode *p, + struct inode *i, void *v, bool b) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_notsupp_get_encryption_info(struct inode *i) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_put_encryption_info(struct inode *i, + struct fscrypt_info *f) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_notsupp_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode, + u32 ilen, struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c) +{ + return; +} + +static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} +#endif /* _LINUX_FSCRYPTO_H */ diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 00b4a6308249..3a09bb4dc3b2 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -52,6 +52,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -693,28 +694,32 @@ TRACE_EVENT(f2fs_direct_IO_exit, __entry->ret) ); -TRACE_EVENT(f2fs_reserve_new_block, +TRACE_EVENT(f2fs_reserve_new_blocks, - TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node), + TP_PROTO(struct inode *inode, nid_t nid, unsigned int ofs_in_node, + blkcnt_t count), - TP_ARGS(inode, nid, ofs_in_node), + TP_ARGS(inode, nid, ofs_in_node, count), TP_STRUCT__entry( __field(dev_t, dev) __field(nid_t, nid) __field(unsigned int, ofs_in_node) + __field(blkcnt_t, count) ), TP_fast_assign( __entry->dev = inode->i_sb->s_dev; __entry->nid = nid; __entry->ofs_in_node = ofs_in_node; + __entry->count = count; ), - TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u", + TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", show_dev(__entry), (unsigned int)__entry->nid, - __entry->ofs_in_node) + __entry->ofs_in_node, + (unsigned long long)__entry->count) ); DECLARE_EVENT_CLASS(f2fs__submit_page_bio, @@ -727,7 +732,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(dev_t, dev) __field(ino_t, ino) __field(pgoff_t, index) - __field(block_t, blkaddr) + __field(block_t, old_blkaddr) + __field(block_t, new_blkaddr) __field(int, rw) __field(int, type) ), @@ -736,16 +742,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->dev = page->mapping->host->i_sb->s_dev; __entry->ino = page->mapping->host->i_ino; __entry->index = page->index; - __entry->blkaddr = fio->blk_addr; + __entry->old_blkaddr = fio->old_blkaddr; + __entry->new_blkaddr = fio->new_blkaddr; __entry->rw = fio->rw; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "blkaddr = 0x%llx, rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", show_dev_ino(__entry), (unsigned long)__entry->index, - (unsigned long long)__entry->blkaddr, + (unsigned long long)__entry->old_blkaddr, + (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->rw), show_block_type(__entry->type)) ); @@ -1265,6 +1273,44 @@ TRACE_EVENT(f2fs_destroy_extent_tree, __entry->node_cnt) ); +DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, type) + __field(s64, count) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->type = type; + __entry->count = count; + ), + + TP_printk("dev = (%d,%d), %s, dirty count = %lld", + show_dev(__entry), + show_file_type(__entry->type), + __entry->count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_enter, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + +DEFINE_EVENT(f2fs_sync_dirty_inodes, f2fs_sync_dirty_inodes_exit, + + TP_PROTO(struct super_block *sb, int type, s64 count), + + TP_ARGS(sb, type, count) +); + #endif /* _TRACE_F2FS_H */ /* This part must be outside protection */ diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index cd0c1a1a9ccf..15048097910f 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -172,6 +172,24 @@ struct inodes_stat_t { #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +/* + * File system encryption support + */ +/* Policy provided via an ioctl on the topmost directory */ +#define FS_KEY_DESCRIPTOR_SIZE 8 + +struct fscrypt_policy { + __u8 version; + __u8 contents_encryption_mode; + __u8 filenames_encryption_mode; + __u8 flags; + __u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; +} __packed; + +#define FS_IOC_SET_ENCRYPTION_POLICY _IOR('f', 19, struct fscrypt_policy) +#define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) +#define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ From 7f1e792457bfaa43caeb7d0ccdc54fcf256e6655 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 13:38:41 -0700 Subject: [PATCH 403/546] f2fs: fix wrong sum_page pointer in f2fs_gc commit de0dcc40f6e24d6bac6b60e36eac4659bbbd3f00 upstream. This patch fixes using a wrong pointer for sum_page in f2fs_gc. Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 0a0a1ad1fe1f..4336807cc690 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -848,16 +848,16 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, for (segno = start_segno; segno < end_segno; segno++) { - if (get_valid_blocks(sbi, segno, 1) == 0 || - unlikely(f2fs_cp_error(sbi))) - goto next; - /* find segment summary of victim */ sum_page = find_get_page(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_bug_on(sbi, !PageUptodate(sum_page)); f2fs_put_page(sum_page, 0); + if (get_valid_blocks(sbi, segno, 1) == 0 || + !PageUptodate(sum_page) || + unlikely(f2fs_cp_error(sbi))) + goto next; + sum = page_address(sum_page); f2fs_bug_on(sbi, type != GET_SUM_TYPE((&sum->footer))); From 819568ea8e0d6180f2d3a47759711f7324560b2c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 22 Nov 2016 14:06:03 -0800 Subject: [PATCH 404/546] posix_acl: Clear SGID bit when setting file permissions commit 073931017b49d9458aa351605b43a7e34598caef upstream. Cherry-pick to f2fs only for generic/375 from: (073931017: posix_acl: Clear SGID bit when setting file permissions) Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index fb0744b94c2f..4a34040932e9 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -215,12 +215,10 @@ static int __f2fs_set_acl(struct inode *inode, int type, case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; if (acl) { - error = posix_acl_equiv_mode(acl, &inode->i_mode); - if (error < 0) + error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (error) return error; set_acl_inode(inode, inode->i_mode); - if (error == 0) - acl = NULL; } break; From ac37cd50a26910b83b19249f444e480823461a42 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 23 Nov 2016 10:51:17 -0800 Subject: [PATCH 405/546] f2fs: fix overflow due to condition check order commit e87f7329bbd6760c2acc4f1eb423362b08851a71 upstream. In the last ilen case, i was already increased, resulting in accessing out- of-boundary entry of do_replace and blkaddr. Fix to check ilen first to exit the loop. Fixes: 2aa8fbb9693020 ("f2fs: refactor __exchange_data_block for speed up") Cc: stable@vger.kernel.org # 4.8+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c6e33258fabf..5c4ea4cf2fb1 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -971,7 +971,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, new_size = (dst + i) << PAGE_SHIFT; if (dst_inode->i_size < new_size) f2fs_i_size_write(dst_inode, new_size); - } while ((do_replace[i] || blkaddr[i] == NULL_ADDR) && --ilen); + } while (--ilen && (do_replace[i] || blkaddr[i] == NULL_ADDR)); f2fs_put_dnode(&dn); } else { From 0184909c6653b2cb87d760a2f99a2105d612b98d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 24 Nov 2016 12:45:15 -0800 Subject: [PATCH 406/546] f2fs: fix to determine start_cp_addr by sbi->cur_cp_pack commit 8508e44ae98622f841f5ef29d0bf3d5db4e0c1cc upstream. We don't guarantee cp_addr is fixed by cp_version. This is to sync with f2fs-tools. Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 8 +++++++- fs/f2fs/f2fs.h | 28 +++++++++++++++++----------- 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cb23d6cf676b..1608ae8eea97 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -770,6 +770,11 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) if (sanity_check_ckpt(sbi)) goto fail_no_cp; + if (cur_page == cp1) + sbi->cur_cp_pack = 1; + else + sbi->cur_cp_pack = 2; + if (cp_blks <= 1) goto done; @@ -1121,7 +1126,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); - start_blk = __start_cp_addr(sbi); + start_blk = __start_cp_next_addr(sbi); /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); @@ -1185,6 +1190,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); + __set_cp_next_pack(sbi); /* * redirty superblock if metadata like node page or inode cache is diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index af293e84e5cd..45d1e4522760 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -789,6 +789,7 @@ struct f2fs_sb_info { /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ + int cur_cp_pack; /* remain current cp pack */ spinlock_t cp_lock; /* for flag in ckpt */ struct inode *meta_inode; /* cache meta blocks */ struct mutex cp_mutex; /* checkpoint procedure lock */ @@ -1354,22 +1355,27 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) static inline block_t __start_cp_addr(struct f2fs_sb_info *sbi) { - block_t start_addr; - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); - unsigned long long ckpt_version = cur_cp_version(ckpt); + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); - - /* - * odd numbered checkpoint should at cp segment 0 - * and even segment must be at cp segment 1 - */ - if (!(ckpt_version & 1)) + if (sbi->cur_cp_pack == 2) start_addr += sbi->blocks_per_seg; - return start_addr; } +static inline block_t __start_cp_next_addr(struct f2fs_sb_info *sbi) +{ + block_t start_addr = le32_to_cpu(F2FS_RAW_SUPER(sbi)->cp_blkaddr); + + if (sbi->cur_cp_pack == 1) + start_addr += sbi->blocks_per_seg; + return start_addr; +} + +static inline void __set_cp_next_pack(struct f2fs_sb_info *sbi) +{ + sbi->cur_cp_pack = (sbi->cur_cp_pack == 1) ? 2 : 1; +} + static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) { return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); From bb7a2718a2a1f33b63016397ab56faa876470afe Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:34 +0800 Subject: [PATCH 407/546] f2fs: exclude free nids building and allocation commit 2411cf5befa5804e4ced4c45a3212d7653869286 upstream. During nid allocation, it needs to exclude building and allocating flow of free nids, this is because while building free nid cache, there are two steps: a) load free nids from unused nat entries in NAT pages, b) update free nid cache by checking nat journal. The two steps should be atomical, otherwise an used nid can be allocated as free one after a) and before b). This patch adds missing lock which covers build_free_nids in unlock_operation and f2fs_balance_fs_bg to avoid that. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b1e615ed2bef..a8c2bd3e5029 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1786,7 +1786,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1840,6 +1840,13 @@ void build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } +void build_free_nids(struct f2fs_sb_info *sbi) +{ + mutex_lock(&NM_I(sbi)->build_lock); + __build_free_nids(sbi); + mutex_unlock(&NM_I(sbi)->build_lock); +} + /* * If this function returns success, caller can obtain a new nid * from second parameter of this function. @@ -1876,9 +1883,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->free_nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - mutex_lock(&nm_i->build_lock); build_free_nids(sbi); - mutex_unlock(&nm_i->build_lock); goto retry; } From da63891af3f34d925c99dad33d31da3cc9c56726 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:00 +0800 Subject: [PATCH 408/546] f2fs: fix to release discard entries during checkpoint commit 2dd15654ac0abe587a245a09a7823bbbd588bfb7 upstream. In f2fs_fill_super, if there is any IO error occurs during recovery, cached discard entries will be leaked, in order to avoid this, make write_checkpoint() handle memory release by itself, besides, move clear_prefree_segments to write_checkpoint for readability. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- fs/f2fs/super.c | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1608ae8eea97..63ca342a3cc8 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1187,7 +1187,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (unlikely(f2fs_cp_error(sbi))) return -EIO; - clear_prefree_segments(sbi, cpc); clear_sbi_flag(sbi, SBI_IS_DIRTY); clear_sbi_flag(sbi, SBI_NEED_CP); __set_cp_next_pack(sbi); @@ -1264,6 +1263,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); + if (err) + release_discard_addrs(sbi); + else + clear_prefree_segments(sbi, cpc); unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fd249cc9b96e..006138a6c5ab 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -738,7 +738,6 @@ static void f2fs_put_super(struct super_block *sb) * In addition, EIO will skip do checkpoint, we need this as well. */ release_ino_entry(sbi, true); - release_discard_addrs(sbi); f2fs_leave_shrinker(sbi); mutex_unlock(&sbi->umount_mutex); From 7849ddb368cbfb64d611e01738bbed3d99f4ac36 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:01 +0800 Subject: [PATCH 409/546] f2fs: give a chance to detach from dirty list commit 933439c8f3474e329709b715b43b0b8168bbecf8 upstream. If there is no dirty pages in inode, we should give a chance to detach the inode from global dirty list, otherwise it needs to call another unnecessary .writepages for detaching. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 8 +++++--- fs/f2fs/dir.c | 1 + fs/f2fs/gc.c | 4 +++- fs/f2fs/inline.c | 4 +++- fs/f2fs/node.c | 1 + fs/f2fs/segment.c | 4 +++- 6 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fc9bfd8f566d..4000ccb85a78 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1817,12 +1817,14 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, return; if (PageDirty(page)) { - if (inode->i_ino == F2FS_META_INO(sbi)) + if (inode->i_ino == F2FS_META_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_META); - else if (inode->i_ino == F2FS_NODE_INO(sbi)) + } else if (inode->i_ino == F2FS_NODE_INO(sbi)) { dec_page_count(sbi, F2FS_DIRTY_NODES); - else + } else { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } } /* This is atomic written page, keep Private */ diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e634a637c443..c0dba11519cf 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -742,6 +742,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, ClearPagePrivate(page); ClearPageUptodate(page); inode_dec_dirty_pages(dir); + remove_dirty_inode(dir); } f2fs_put_page(page, 1); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 4336807cc690..72a0ca08f901 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -670,8 +670,10 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) retry: set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } set_cold_data(page); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b150d03beec1..a1acbc67d827 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -154,8 +154,10 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) fio.old_blkaddr = dn->data_blkaddr; write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); - if (dirty) + if (dirty) { inode_dec_dirty_pages(dn->inode); + remove_dirty_inode(dn->inode); + } /* this converted inline_data should be recovered. */ set_inode_flag(dn->inode, FI_APPEND_WRITE); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index a8c2bd3e5029..97eb2c0811b5 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1203,6 +1203,7 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) ret = f2fs_write_inline_data(inode, page); inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); if (ret) set_page_dirty(page); page_out: diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b3c61ae37f92..75477ec6c535 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -272,8 +272,10 @@ static int __commit_inmem_pages(struct inode *inode, set_page_dirty(page); f2fs_wait_on_page_writeback(page, DATA, true); - if (clear_page_dirty_for_io(page)) + if (clear_page_dirty_for_io(page)) { inode_dec_dirty_pages(inode); + remove_dirty_inode(inode); + } fio.page = page; err = do_write_data_page(&fio); From 97033d8202f3eb6f373a7e815602199ab8c02ddc Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:02 +0800 Subject: [PATCH 410/546] f2fs: add missing f2fs_balance_fs in f2fs_zero_range commit 9434fcde1fa0f48e1a29fbdd9d436fa279aeb909 upstream. f2fs_balance_fs should be called in between node page updating, otherwise node page count will exceeded far beyond watermark of triggering foreground garbage collection, result in facing high risk of hitting LFS allocation failure. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c4ea4cf2fb1..c0774c98dce4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1222,6 +1222,9 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = f2fs_do_zero_range(&dn, index, end); f2fs_put_dnode(&dn); f2fs_unlock_op(sbi); + + f2fs_balance_fs(sbi, dn.node_changed); + if (ret) goto out; From 0b7e41e009bb17ea09c6353d822e6548ea34da01 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:03 +0800 Subject: [PATCH 411/546] f2fs: don't miss any f2fs_balance_fs cases commit 6f2d8ed654bfa391854df4de854953f772a16a9d upstream. In f2fs_map_blocks, let f2fs_balance_fs detects node page modification with dn.node_changed to avoid miss some corner cases. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4000ccb85a78..c390542917e4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -671,7 +671,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; struct extent_info ei; - bool allocated = false; block_t blkaddr; if (!maxblocks) @@ -730,10 +729,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } } else { err = __allocate_data_block(&dn); - if (!err) { + if (!err) set_inode_flag(inode, FI_APPEND_WRITE); - allocated = true; - } } if (err) goto sync_out; @@ -788,7 +785,6 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, err = reserve_new_blocks(&dn, prealloc); if (err) goto sync_out; - allocated = dn.node_changed; map->m_len += dn.ofs_in_node - ofs_in_node; if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { @@ -807,9 +803,8 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } - allocated = false; goto next_dnode; sync_out: @@ -817,7 +812,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, unlock_out: if (create) { f2fs_unlock_op(sbi); - f2fs_balance_fs(sbi, allocated); + f2fs_balance_fs(sbi, dn.node_changed); } out: trace_f2fs_map_blocks(inode, map, err); From 91391d1d48858163479705b50a9a129e29882d1d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:04 +0800 Subject: [PATCH 412/546] f2fs: be aware of extent beyond EOF in fiemap commit 58736fa60f6ae659ac72da8b1580c308b47e8edd upstream. f2fs can support fallocating blocks beyond file size without changing the size, but ->fiemap of f2fs was restricted and can't detect these extents fallocated past EOF, now relieve the restriction. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++-------------- 1 file changed, 4 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index c390542917e4..0cdabfce6c17 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -881,7 +881,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, struct buffer_head map_bh; sector_t start_blk, last_blk; pgoff_t next_pgofs; - loff_t isize; u64 logical = 0, phys = 0, size = 0; u32 flags = 0; int ret = 0; @@ -898,13 +897,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); - isize = i_size_read(inode); - if (start >= isize) - goto out; - - if (start + len > isize) - len = isize - start; - if (logical_to_blk(inode, len) == 0) len = blk_to_logical(inode, 1); @@ -923,13 +915,11 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, /* HOLE */ if (!buffer_mapped(&map_bh)) { start_blk = next_pgofs; - /* Go through holes util pass the EOF */ - if (blk_to_logical(inode, start_blk) < isize) + + if (blk_to_logical(inode, start_blk) < blk_to_logical(inode, + F2FS_I_SB(inode)->max_file_blocks)) goto prep_next; - /* Found a hole beyond isize means no more extents. - * Note that the premise is that filesystems don't - * punch holes beyond isize and keep size unchanged. - */ + flags |= FIEMAP_EXTENT_LAST; } From ef78c27020edf666dc21e36223338b73f05e027d Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:05 +0800 Subject: [PATCH 413/546] f2fs: fix to update largest extent under lock commit b691d98fdd4cc2514c60fd6975e6016da203e64f upstream. In order to avoid racing problem, make largest extent cache being updated under lock. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d7369895a78a..1fbebcb33a9d 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -252,6 +252,7 @@ struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino) int update_inode(struct inode *inode, struct page *node_page) { struct f2fs_inode *ri; + struct extent_tree *et = F2FS_I(inode)->extent_tree; f2fs_inode_synced(inode); @@ -267,11 +268,13 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_size = cpu_to_le64(i_size_read(inode)); ri->i_blocks = cpu_to_le64(inode->i_blocks); - if (F2FS_I(inode)->extent_tree) - set_raw_extent(&F2FS_I(inode)->extent_tree->largest, - &ri->i_ext); - else + if (et) { + read_lock(&et->lock); + set_raw_extent(&et->largest, &ri->i_ext); + read_unlock(&et->lock); + } else { memset(&ri->i_ext, 0, sizeof(ri->i_ext)); + } set_raw_inline(inode, ri); ri->i_atime = cpu_to_le64(inode->i_atime.tv_sec); From 920b3956b974e5056410271120e94ccc9acc283f Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:57:06 +0800 Subject: [PATCH 414/546] f2fs: fix error handling in fsync_node_pages commit 9de69279750e9740bc7221c7051a40c0516a58fb upstream. In fsync_node_pages, if f2fs was taged with CP_ERROR_FLAG, make sure bio cache was flushed before return. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 97eb2c0811b5..bc38e5a92b4b 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1338,7 +1338,8 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); pagevec_release(&pvec); - return -EIO; + ret = -EIO; + goto out; } if (!IS_DNODE(page) || !is_cold_node(page)) @@ -1411,7 +1412,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, unlock_page(last_page); goto retry; } - +out: if (nwritten) f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); return ret ? -EIO: 0; From b20aaaa35b9005fd3ce44daa5809f01db15b226c Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Tue, 11 Oct 2016 10:36:12 -0700 Subject: [PATCH 415/546] f2fs: fix sparse warnings commit 0c0b471e43e7acf0747c6eb410863bf78c14750d upstream. f2fs contained a number of endianness conversion bugs. Also, one function should have been 'static'. Found with sparse by running 'make C=2 CF=-D__CHECK_ENDIAN__ fs/f2fs/' Signed-off-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 2 +- fs/f2fs/inline.c | 2 +- fs/f2fs/node.c | 5 +++-- fs/f2fs/node.h | 2 +- 4 files changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index c0dba11519cf..7136dc1ade11 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -136,7 +136,7 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, /* show encrypted name */ if (fname->hash) { - if (de->hash_code == fname->hash) + if (de->hash_code == cpu_to_le32(fname->hash)) goto found; } else if (de_name.len == name->len && de->hash_code == namehash && diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index a1acbc67d827..56363c18489a 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -438,7 +438,7 @@ static int f2fs_add_inline_entries(struct inode *dir, } new_name.name = d.filename[bit_pos]; - new_name.len = de->name_len; + new_name.len = le16_to_cpu(de->name_len); ino = le32_to_cpu(de->ino); fake_mode = get_de_type(de) << S_SHIFT; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc38e5a92b4b..d2ba37a84f8e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -270,8 +270,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&e->ni, ne); } else { - f2fs_bug_on(sbi, nat_get_ino(e) != ne->ino || - nat_get_blkaddr(e) != ne->block_addr || + f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || + nat_get_blkaddr(e) != + le32_to_cpu(ne->block_addr) || nat_get_version(e) != ne->version); } } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 868bec65e51c..cfdcf98516a1 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -313,7 +313,7 @@ static inline bool is_recoverable_dnode(struct page *page) ((unsigned char *)ckpt + crc_offset))); cp_ver |= (crc << 32); } - return cpu_to_le64(cp_ver) == cpver_of_node(page); + return cp_ver == cpver_of_node(page); } /* From d9d712862732783e16998d11f9b9867d28e282fd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:56:59 +0800 Subject: [PATCH 416/546] f2fs: clear nlink if fail to add_link commit a11b9f65eae766b17ec3451a6a1766f0a9d1dbff upstream. We don't need to keep incomplete created inode in cache, so if we fail to add link into directory during new inode creation, it's better to set nlink of inode to zero, then we can evict inode immediately. Otherwise release of nid belong to inode will be delayed until inode cache is being shrunk, it may cause a seemingly endless loop while allocating free nids in time of testing generic/269 case of fstest suit. Signed-off-by: Chao Yu [Jaegeuk Kim: add update_inode_page to fix kernel panic] Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 1fbebcb33a9d..d32fd0343eae 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -387,6 +387,8 @@ void f2fs_evict_inode(struct inode *inode) f2fs_lock_op(sbi); err = remove_inode_page(inode); f2fs_unlock_op(sbi); + if (err == -ENOENT) + err = 0; } /* give more chances, if ENOMEM case */ @@ -427,6 +429,18 @@ void handle_failed_inode(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct node_info ni; + /* + * clear nlink of inode in order to release resource of inode + * immediately. + */ + clear_nlink(inode); + + /* + * we must call this to avoid inode being remained as dirty, resulting + * in a panic when flushing dirty inodes in gdirty_list. + */ + update_inode_page(inode); + /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); From b28faafd2429a0f949df99b065f38e46184c0a59 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 12 Oct 2016 19:28:29 +0800 Subject: [PATCH 417/546] f2fs: split free nid list commit b8559dc242d1d47dcf99660a4d6afded727e0cc0 upstream. During free nid allocation, in order to do preallocation, we will tag free nid entry as allocated one and still leave it in free nid list, for other allocators who want to grab free nids, it needs to traverse the free nid list for lookup. It becomes overhead in scenario of allocating free nid intensively by multithreads. This patch splits free nid list to two list: {free,alloc}_nid_list, to keep free nids and preallocated free nids separately, after that, traverse latency will be gone, besides split nid_cnt for separate statistic. Additionally, introduce __insert_nid_to_list and __remove_nid_from_list for cleanup. Signed-off-by: Chao Yu [Jaegeuk Kim: modify f2fs_bug_on to avoid needless branches] Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 11 ++-- fs/f2fs/f2fs.h | 14 +++-- fs/f2fs/node.c | 136 +++++++++++++++++++++++++++------------------ fs/f2fs/node.h | 11 ++-- fs/f2fs/shrinker.c | 4 +- 5 files changed, 108 insertions(+), 68 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 1c35e80732e0..5cab80dfa4dc 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -74,7 +74,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->dirty_nats = NM_I(sbi)->dirty_nat_cnt; si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; - si->fnids = NM_I(sbi)->fcnt; + si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) * 100 / (int)(sbi->user_block_count >> sbi->log_blocks_per_seg) @@ -194,7 +195,9 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ - si->cache_mem += NM_I(sbi)->fcnt * sizeof(struct free_nid); + si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + + NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]) * + sizeof(struct free_nid); si->cache_mem += NM_I(sbi)->nat_cnt * sizeof(struct nat_entry); si->cache_mem += NM_I(sbi)->dirty_nat_cnt * sizeof(struct nat_entry_set); @@ -324,8 +327,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d\n", - si->fnids); + seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", + si->free_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 45d1e4522760..cec025852c22 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -529,6 +529,12 @@ static inline void __try_update_largest_extent(struct inode *inode, } } +enum nid_list { + FREE_NID_LIST, + ALLOC_NID_LIST, + MAX_NID_LIST, +}; + struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ @@ -548,9 +554,9 @@ struct f2fs_nm_info { /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ - struct list_head free_nid_list; /* a list for free nids */ - spinlock_t free_nid_list_lock; /* protect free nid list */ - unsigned int fcnt; /* the number of free node id */ + struct list_head nid_list[MAX_NID_LIST];/* lists for free nids */ + unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ + spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ /* for checkpoint */ @@ -2214,7 +2220,7 @@ struct f2fs_stat_info { s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; s64 inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, fnids; + int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; int bg_gc, wb_bios; int inline_xattr, inline_inode, inline_dir, orphans; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index d2ba37a84f8e..5bb2fa324e68 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -45,8 +45,8 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) * give 25%, 25%, 50%, 50%, 50% memory for each components respectively */ if (type == FREE_NIDS) { - mem_size = (nm_i->fcnt * sizeof(struct free_nid)) >> - PAGE_SHIFT; + mem_size = (nm_i->nid_cnt[FREE_NID_LIST] * + sizeof(struct free_nid)) >> PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 2); } else if (type == NAT_ENTRIES) { mem_size = (nm_i->nat_cnt * sizeof(struct nat_entry)) >> @@ -1699,10 +1699,31 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, struct free_nid *i) { - list_del(&i->list); radix_tree_delete(&nm_i->free_nid_root, i->nid); } +static void __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]++; + list_add_tail(&i->list, &nm_i->nid_list[list]); +} + +static void __remove_nid_from_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : + i->state != NID_ALLOC); + nm_i->nid_cnt[list]--; + list_del(&i->list); +} + static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1733,33 +1754,33 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) return 0; } - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); kmem_cache_free(free_nid_slab, i); return 0; } - list_add_tail(&i->list, &nm_i->free_nid_list); - nm_i->fcnt++; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, FREE_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); return 1; } -static void remove_free_nid(struct f2fs_nm_info *nm_i, nid_t nid) +static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) { + struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; bool need_free = false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; need_free = true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1798,7 +1819,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nid_t nid = nm_i->next_scan_nid; /* Enough entries */ - if (nm_i->fcnt >= NAT_ENTRY_PER_BLOCK) + if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; /* readahead nat pages to be scanned */ @@ -1834,7 +1855,7 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (addr == NULL_ADDR) add_free_nid(sbi, nid, true); else - remove_free_nid(nm_i, nid); + remove_free_nid(sbi, nid); } up_read(&curseg->journal_rwsem); up_read(&nm_i->nat_tree_lock); @@ -1867,23 +1888,22 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) return false; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); /* We should not use stale free nids created by build_free_nids */ - if (nm_i->fcnt && !on_build_free_nids(nm_i)) { - f2fs_bug_on(sbi, list_empty(&nm_i->free_nid_list)); - list_for_each_entry(i, &nm_i->free_nid_list, list) - if (i->state == NID_NEW) - break; - - f2fs_bug_on(sbi, i->state != NID_NEW); + if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { + f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); + i = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], + struct free_nid, list); *nid = i->nid; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); i->state = NID_ALLOC; - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + spin_unlock(&nm_i->nid_list_lock); return true; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ build_free_nids(sbi); @@ -1898,11 +1918,12 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); __del_from_free_nid_list(nm_i, i); - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); } @@ -1919,17 +1940,20 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) if (!nid) return; - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); - f2fs_bug_on(sbi, !i || i->state != NID_ALLOC); + f2fs_bug_on(sbi, !i); + + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); + if (!available_free_memory(sbi, FREE_NIDS)) { __del_from_free_nid_list(nm_i, i); need_free = true; } else { i->state = NID_NEW; - nm_i->fcnt++; + __insert_nid_to_list(sbi, i, FREE_NID_LIST); } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); if (need_free) kmem_cache_free(free_nid_slab, i); @@ -1941,24 +1965,26 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) struct free_nid *i, *next; int nr = nr_shrink; - if (nm_i->fcnt <= MAX_FREE_NIDS) + if (nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) return 0; if (!mutex_trylock(&nm_i->build_lock)) return 0; - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next, &nm_i->free_nid_list, list) { - if (nr_shrink <= 0 || nm_i->fcnt <= MAX_FREE_NIDS) + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next, &nm_i->nid_list[FREE_NID_LIST], + list) { + if (nr_shrink <= 0 || + nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - if (i->state == NID_ALLOC) - continue; + + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); + kmem_cache_free(free_nid_slab, i); - nm_i->fcnt--; nr_shrink--; } - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); mutex_unlock(&nm_i->build_lock); return nr - nr_shrink; @@ -2014,7 +2040,7 @@ void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) if (unlikely(!inc_valid_node_count(sbi, inode))) f2fs_bug_on(sbi, 1); - remove_free_nid(NM_I(sbi), new_xnid); + remove_free_nid(sbi, new_xnid); get_node_info(sbi, new_xnid, &ni); ni.ino = inode->i_ino; set_node_addr(sbi, &ni, NEW_ADDR, false); @@ -2044,7 +2070,7 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) } /* Should not use this inode from free nid list */ - remove_free_nid(NM_I(sbi), ino); + remove_free_nid(sbi, ino); if (!PageUptodate(ipage)) SetPageUptodate(ipage); @@ -2278,20 +2304,22 @@ static int init_node_manager(struct f2fs_sb_info *sbi) /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; - nm_i->fcnt = 0; + nm_i->nid_cnt[FREE_NID_LIST] = 0; + nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; nm_i->ram_thresh = DEF_RAM_THRESHOLD; nm_i->ra_nid_pages = DEF_RA_NID_PAGES; nm_i->dirty_nats_ratio = DEF_DIRTY_NAT_RATIO_THRESHOLD; INIT_RADIX_TREE(&nm_i->free_nid_root, GFP_ATOMIC); - INIT_LIST_HEAD(&nm_i->free_nid_list); + INIT_LIST_HEAD(&nm_i->nid_list[FREE_NID_LIST]); + INIT_LIST_HEAD(&nm_i->nid_list[ALLOC_NID_LIST]); INIT_RADIX_TREE(&nm_i->nat_root, GFP_NOIO); INIT_RADIX_TREE(&nm_i->nat_set_root, GFP_NOIO); INIT_LIST_HEAD(&nm_i->nat_entries); mutex_init(&nm_i->build_lock); - spin_lock_init(&nm_i->free_nid_list_lock); + spin_lock_init(&nm_i->nid_list_lock); init_rwsem(&nm_i->nat_tree_lock); nm_i->next_scan_nid = le32_to_cpu(sbi->ckpt->next_free_nid); @@ -2336,17 +2364,19 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) return; /* destroy free nid list */ - spin_lock(&nm_i->free_nid_list_lock); - list_for_each_entry_safe(i, next_i, &nm_i->free_nid_list, list) { - f2fs_bug_on(sbi, i->state == NID_ALLOC); + spin_lock(&nm_i->nid_list_lock); + list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], + list) { + __remove_nid_from_list(sbi, i, FREE_NID_LIST); __del_from_free_nid_list(nm_i, i); - nm_i->fcnt--; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); - spin_lock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); } - f2fs_bug_on(sbi, nm_i->fcnt); - spin_unlock(&nm_i->free_nid_list_lock); + f2fs_bug_on(sbi, nm_i->nid_cnt[FREE_NID_LIST]); + f2fs_bug_on(sbi, nm_i->nid_cnt[ALLOC_NID_LIST]); + f2fs_bug_on(sbi, !list_empty(&nm_i->nid_list[ALLOC_NID_LIST])); + spin_unlock(&nm_i->nid_list_lock); /* destroy nat cache */ down_write(&nm_i->nat_tree_lock); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index cfdcf98516a1..e7997e240366 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -169,14 +169,15 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *fnid; - spin_lock(&nm_i->free_nid_list_lock); - if (nm_i->fcnt <= 0) { - spin_unlock(&nm_i->free_nid_list_lock); + spin_lock(&nm_i->nid_list_lock); + if (nm_i->nid_cnt[FREE_NID_LIST] <= 0) { + spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->free_nid_list.next, struct free_nid, list); + fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + struct free_nid, list); *nid = fnid->nid; - spin_unlock(&nm_i->free_nid_list_lock); + spin_unlock(&nm_i->nid_list_lock); } /* diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 46c915425923..ec539f407cc4 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -26,8 +26,8 @@ static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->fcnt > MAX_FREE_NIDS) - return NM_I(sbi)->fcnt - MAX_FREE_NIDS; + if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) + return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; return 0; } From aa192f921db8415e7f56e52cb477a13a943d0f47 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 12 Oct 2016 10:09:59 -0700 Subject: [PATCH 418/546] f2fs: clean up free nid list operations commit eb0aa4b80784b8551bd5be577024e067bc83ef94 upstream. This patch cleans up to use consistent free nid list ops. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 56 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5bb2fa324e68..ef5357c7af24 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1696,25 +1696,26 @@ static struct free_nid *__lookup_free_nid_list(struct f2fs_nm_info *nm_i, return radix_tree_lookup(&nm_i->free_nid_root, n); } -static void __del_from_free_nid_list(struct f2fs_nm_info *nm_i, - struct free_nid *i) -{ - radix_tree_delete(&nm_i->free_nid_root, i->nid); -} - -static void __insert_nid_to_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) +static int __insert_nid_to_list(struct f2fs_sb_info *sbi, + struct free_nid *i, enum nid_list list, bool new) { struct f2fs_nm_info *nm_i = NM_I(sbi); + if (new) { + int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) + return err; + } + f2fs_bug_on(sbi, list == FREE_NID_LIST ? i->state != NID_NEW : i->state != NID_ALLOC); nm_i->nid_cnt[list]++; list_add_tail(&i->list, &nm_i->nid_list[list]); + return 0; } static void __remove_nid_from_list(struct f2fs_sb_info *sbi, - struct free_nid *i, enum nid_list list) + struct free_nid *i, enum nid_list list, bool reuse) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -1722,6 +1723,8 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, i->state != NID_ALLOC); nm_i->nid_cnt[list]--; list_del(&i->list); + if (!reuse) + radix_tree_delete(&nm_i->free_nid_root, i->nid); } static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) @@ -1729,6 +1732,7 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i; struct nat_entry *ne; + int err; if (!available_free_memory(sbi, FREE_NIDS)) return -1; @@ -1755,15 +1759,13 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) } spin_lock(&nm_i->nid_list_lock); - if (radix_tree_insert(&nm_i->free_nid_root, i->nid, i)) { - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); + err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); + spin_unlock(&nm_i->nid_list_lock); + radix_tree_preload_end(); + if (err) { kmem_cache_free(free_nid_slab, i); return 0; } - __insert_nid_to_list(sbi, i, FREE_NID_LIST); - spin_unlock(&nm_i->nid_list_lock); - radix_tree_preload_end(); return 1; } @@ -1776,8 +1778,7 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); if (i && i->state == NID_NEW) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); need_free = true; } spin_unlock(&nm_i->nid_list_lock); @@ -1897,9 +1898,9 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid, list); *nid = i->nid; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; - __insert_nid_to_list(sbi, i, ALLOC_NID_LIST); + __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1921,8 +1922,7 @@ void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid) spin_lock(&nm_i->nid_list_lock); i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); @@ -1944,14 +1944,13 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i = __lookup_free_nid_list(nm_i, nid); f2fs_bug_on(sbi, !i); - __remove_nid_from_list(sbi, i, ALLOC_NID_LIST); - if (!available_free_memory(sbi, FREE_NIDS)) { - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, false); need_free = true; } else { + __remove_nid_from_list(sbi, i, ALLOC_NID_LIST, true); i->state = NID_NEW; - __insert_nid_to_list(sbi, i, FREE_NID_LIST); + __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } spin_unlock(&nm_i->nid_list_lock); @@ -1978,9 +1977,7 @@ int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) nm_i->nid_cnt[FREE_NID_LIST] <= MAX_FREE_NIDS) break; - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); - + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); kmem_cache_free(free_nid_slab, i); nr_shrink--; } @@ -2367,8 +2364,7 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) spin_lock(&nm_i->nid_list_lock); list_for_each_entry_safe(i, next_i, &nm_i->nid_list[FREE_NID_LIST], list) { - __remove_nid_from_list(sbi, i, FREE_NID_LIST); - __del_from_free_nid_list(nm_i, i); + __remove_nid_from_list(sbi, i, FREE_NID_LIST, false); spin_unlock(&nm_i->nid_list_lock); kmem_cache_free(free_nid_slab, i); spin_lock(&nm_i->nid_list_lock); From dbca3031f03df46b4e3f92c0bbd41eaacc30b630 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:35 +0800 Subject: [PATCH 419/546] f2fs: don't interrupt free nids building during nid allocation commit 3a2ad5672bb36ee9c07bab97dadc8b0f70d391f4 upstream. Let build_free_nids support sync/async methods, in allocation flow of nids, we use synchronuous method, so that we can avoid looping in alloc_nid when free memory is low; in unblock_operations and f2fs_balance_fs_bg we use asynchronuous method in where low memory condition can interrupt us. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 22 ++++++++++------------ fs/f2fs/segment.c | 2 +- 4 files changed, 13 insertions(+), 15 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 63ca342a3cc8..1d273f51bc1c 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -990,7 +990,7 @@ static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - build_free_nids(sbi); + build_free_nids(sbi, false); f2fs_unlock_all(sbi); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index cec025852c22..9e8d3c9af54a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2071,7 +2071,7 @@ void move_node_page(struct page *, int); int fsync_node_pages(struct f2fs_sb_info *, struct inode *, struct writeback_control *, bool); int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *); +void build_free_nids(struct f2fs_sb_info *, bool); bool alloc_nid(struct f2fs_sb_info *, nid_t *); void alloc_nid_done(struct f2fs_sb_info *, nid_t); void alloc_nid_failed(struct f2fs_sb_info *, nid_t); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index ef5357c7af24..5800a1082fe8 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1734,9 +1734,6 @@ static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) struct nat_entry *ne; int err; - if (!available_free_memory(sbi, FREE_NIDS)) - return -1; - /* 0 nid should not be used */ if (unlikely(nid == 0)) return 0; @@ -1804,14 +1801,12 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); - if (blk_addr == NULL_ADDR) { - if (add_free_nid(sbi, start_nid, true) < 0) - break; - } + if (blk_addr == NULL_ADDR) + add_free_nid(sbi, start_nid, true); } } -void __build_free_nids(struct f2fs_sb_info *sbi) +void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1823,6 +1818,9 @@ void __build_free_nids(struct f2fs_sb_info *sbi) if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; + if (!sync && !available_free_memory(sbi, FREE_NIDS)) + return; + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1865,10 +1863,10 @@ void __build_free_nids(struct f2fs_sb_info *sbi) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi); + __build_free_nids(sbi, sync); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1907,7 +1905,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi); + build_free_nids(sbi, true); goto retry; } @@ -2344,7 +2342,7 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi); + build_free_nids(sbi, true); return 0; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 75477ec6c535..48903702de27 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -380,7 +380,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi); + build_free_nids(sbi, false); /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || From 865d1e287fc2ae3d6c2f75b87f92373c3e01e260 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 11 Oct 2016 22:31:36 +0800 Subject: [PATCH 420/546] f2fs: avoid casted negative value as shrink count commit 02110a4fd53164db7cce3bb2780dce4d6c4e058f upstream. This patch makes sure it returns a positive value instead of a probable casted negative value as shrink count. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/shrinker.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index ec539f407cc4..5c60fc28ec75 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -21,14 +21,16 @@ static unsigned int shrinker_run_no; static unsigned long __count_nat_entries(struct f2fs_sb_info *sbi) { - return NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + long count = NM_I(sbi)->nat_cnt - NM_I(sbi)->dirty_nat_cnt; + + return count > 0 ? count : 0; } static unsigned long __count_free_nids(struct f2fs_sb_info *sbi) { - if (NM_I(sbi)->nid_cnt[FREE_NID_LIST] > MAX_FREE_NIDS) - return NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; - return 0; + long count = NM_I(sbi)->nid_cnt[FREE_NID_LIST] - MAX_FREE_NIDS; + + return count > 0 ? count : 0; } static unsigned long __count_extent_cache(struct f2fs_sb_info *sbi) From d3630022eb25e6f91490242404bf767f7a9502bd Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:28:05 -0700 Subject: [PATCH 421/546] f2fs: count dirty inodes to flush node pages during checkpoint commit b9610bdfcbdbb6017802ec6d1e073f445c98157d upstream. If there are a lot of dirty inodes, we need to flush all of them when doing checkpoint. So, we need to count this for enough free space. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index fecb856ad874..762743988426 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -471,11 +471,12 @@ static inline bool need_SSR(struct f2fs_sb_info *sbi) { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (test_opt(sbi, LFS)) return false; - return free_sections(sbi) <= (node_secs + 2 * dent_secs + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + reserved_sections(sbi) + 1); } @@ -484,14 +485,14 @@ static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, { int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - - node_secs += get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) return false; return (free_sections(sbi) + freed) <= - (node_secs + 2 * dent_secs + reserved_sections(sbi) + needed); + (node_secs + 2 * dent_secs + imeta_secs + + reserved_sections(sbi) + needed); } static inline bool excess_prefree_segs(struct f2fs_sb_info *sbi) From 3e1db115c89b6bbd1975c2ea1ee5dc1b853c1d08 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 13:30:31 -0700 Subject: [PATCH 422/546] f2fs: call f2fs_balance_fs for setattr commit 15d04354555fdfe8005e1365009e349148fb5f90 upstream. If inode becomes dirty, we need to check the # of dirty inodes whether or not further checkpoint would be required. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index c0774c98dce4..53ba384cb675 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -695,7 +695,6 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) err = f2fs_truncate(inode); if (err) return err; - f2fs_balance_fs(F2FS_I_SB(inode), true); } else { /* * do not trim all blocks after i_size if target size is @@ -724,6 +723,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } f2fs_mark_inode_dirty_sync(inode); + + /* inode change will produce dirty node pages flushed by checkpoint */ + f2fs_balance_fs(F2FS_I_SB(inode), true); + return err; } From ca299726601c253d4ba97e9361953feda584aa5c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 17 Oct 2016 15:36:31 -0700 Subject: [PATCH 423/546] f2fs: declare static function for __build_free_nids commit 3e7b5bbbef7f5eb8a19aa61b611c704bf8230937 upstream. This patch avoids build warning. Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5800a1082fe8..e1ce0b8438fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1806,7 +1806,7 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, } } -void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); From 6beab0ddde1c81dc3ec7fee8713c01f2c8868b4e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 18 Oct 2016 11:07:45 -0700 Subject: [PATCH 424/546] f2fs: use BIO_MAX_PAGES for bio allocation commit 664ba972df9b96942191db3068274cc1db899774 upstream. We don't need to allocate bio partially in order to maximize sequential writes. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 4 +--- fs/f2fs/node.c | 3 +-- fs/f2fs/segment.c | 4 ++-- fs/f2fs/segment.h | 17 +++-------------- 5 files changed, 8 insertions(+), 22 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1d273f51bc1c..1dffe86651be 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -226,7 +226,7 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) f2fs_put_page(page, 0); if (readahead) - ra_meta_pages(sbi, index, MAX_BIO_BLOCKS(sbi), META_POR, true); + ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } static int f2fs_write_meta_page(struct page *page, diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 0cdabfce6c17..e92cf046d9ff 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -273,10 +273,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { - int bio_blocks = MAX_BIO_BLOCKS(sbi); - io->bio = __bio_alloc(sbi, fio->new_blkaddr, - bio_blocks, is_read); + BIO_MAX_PAGES, is_read); io->fio = *fio; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index e1ce0b8438fc..389be7f6e07c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2099,7 +2099,6 @@ int restore_node_summary(struct f2fs_sb_info *sbi, struct f2fs_node *rn; struct f2fs_summary *sum_entry; block_t addr; - int bio_blocks = MAX_BIO_BLOCKS(sbi); int i, idx, last_offset, nrpages; /* scan the node segment */ @@ -2108,7 +2107,7 @@ int restore_node_summary(struct f2fs_sb_info *sbi, sum_entry = &sum->entries[0]; for (i = 0; i < last_offset; i += nrpages, addr += nrpages) { - nrpages = min(last_offset - i, bio_blocks); + nrpages = min(last_offset - i, BIO_MAX_PAGES); /* readahead node pages */ ra_meta_pages(sbi, addr, nrpages, META_POR, true); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 48903702de27..ec4d74c26067 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2239,10 +2239,10 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) int sit_blk_cnt = SIT_BLK_CNT(sbi); unsigned int i, start, end; unsigned int readed, start_blk = 0; - int nrpages = MAX_BIO_BLOCKS(sbi) * 8; do { - readed = ra_meta_pages(sbi, start_blk, nrpages, META_SIT, true); + readed = ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES, + META_SIT, true); start = start_blk * sit_i->sents_per_block; end = (start_blk + readed) * sit_i->sents_per_block; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 762743988426..89ab4301ef02 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -102,8 +102,6 @@ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) -#define MAX_BIO_BLOCKS(sbi) \ - ((int)min((int)max_hw_blocks(sbi), BIO_MAX_PAGES)) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -696,13 +694,6 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) return false; } -static inline unsigned int max_hw_blocks(struct f2fs_sb_info *sbi) -{ - struct block_device *bdev = sbi->sb->s_bdev; - struct request_queue *q = bdev_get_queue(bdev); - return SECTOR_TO_BLOCK(queue_max_sectors(q)); -} - /* * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. @@ -720,7 +711,7 @@ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) else if (type == NODE) return 8 * sbi->blocks_per_seg; else if (type == META) - return 8 * MAX_BIO_BLOCKS(sbi); + return 8 * BIO_MAX_PAGES; else return 0; } @@ -737,11 +728,9 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, return 0; nr_to_write = wbc->nr_to_write; - + desired = BIO_MAX_PAGES; if (type == NODE) - desired = 2 * max_hw_blocks(sbi); - else - desired = MAX_BIO_BLOCKS(sbi); + desired <<= 1; wbc->nr_to_write = desired; return desired - nr_to_write; From 4bb3b1e330d7ba1f58e9b52997e8d8f61679cef1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 10 Nov 2016 18:04:05 -0800 Subject: [PATCH 425/546] f2fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps commit 02027d42c3f747945f19111d3da2092ed2148ac8 upstream. This is for backport only. fs: Replace CURRENT_TIME_SEC with current_time() for inode timestamps Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 8 ++++---- fs/f2fs/f2fs.h | 22 ++++++++++++++++++++++ fs/f2fs/file.c | 8 ++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/namei.c | 8 ++++---- fs/f2fs/xattr.c | 2 +- 6 files changed, 36 insertions(+), 14 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 7136dc1ade11..3b8ebec0450b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -312,7 +312,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, f2fs_dentry_kunmap(dir, page); set_page_dirty(page); - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); f2fs_put_page(page, 1); } @@ -465,7 +465,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, f2fs_i_links_write(dir, true); clear_inode_flag(inode, FI_NEW_INODE); } - dir->i_mtime = dir->i_ctime = CURRENT_TIME; + dir->i_mtime = dir->i_ctime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (F2FS_I(dir)->i_current_depth != current_depth) @@ -683,7 +683,7 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) if (S_ISDIR(inode->i_mode)) f2fs_i_links_write(dir, false); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_i_links_write(inode, false); if (S_ISDIR(inode->i_mode)) { @@ -730,7 +730,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, kunmap(page); /* kunmap - pair of f2fs_find_entry */ set_page_dirty(page); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 9e8d3c9af54a..1938fe457041 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -138,6 +138,28 @@ static inline void inode_nohighmem(struct inode *inode) mapping_set_gfp_mask(inode->i_mapping, GFP_USER); } +/** + * current_time - Return FS time + * @inode: inode. + * + * Return the current time truncated to the time granularity supported by + * the fs. + * + * Note that inode and inode->sb cannot be NULL. + * Otherwise, the function warns and returns time without truncation. + */ +static inline struct timespec current_time(struct inode *inode) +{ + struct timespec now = current_kernel_time(); + + if (unlikely(!inode->i_sb)) { + WARN(1, "current_time() called with uninitialized super_block in the inode"); + return now; + } + + return timespec_trunc(now, inode->i_sb->s_time_gran); +} + /* * For checkpoint manager */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 53ba384cb675..04a3205d2934 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -632,7 +632,7 @@ int f2fs_truncate(struct inode *inode) if (err) return err; - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); return 0; } @@ -708,7 +708,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; } - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); } } @@ -1402,7 +1402,7 @@ static long f2fs_fallocate(struct file *file, int mode, } if (!ret) { - inode->i_mtime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } @@ -1494,7 +1494,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) fi->i_flags = flags; inode_unlock(inode); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); out: mnt_drop_write_file(filp); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 56363c18489a..3b2f88788e02 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -591,7 +591,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); f2fs_put_page(page, 1); - dir->i_ctime = dir->i_mtime = CURRENT_TIME; + dir->i_ctime = dir->i_mtime = current_time(dir); f2fs_mark_inode_dirty_sync(dir); if (inode) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 0f071a70522d..ae29726afff0 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -46,7 +46,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) inode->i_ino = ino; inode->i_blocks = 0; - inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME; + inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode); inode->i_generation = sbi->s_next_generation++; err = insert_inode_locked(inode); @@ -182,7 +182,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, f2fs_balance_fs(sbi, true); - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); ihold(inode); set_inode_flag(inode, FI_INC_LINK); @@ -720,7 +720,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_set_link(new_dir, new_entry, new_page, old_inode); - new_inode->i_ctime = CURRENT_TIME; + new_inode->i_ctime = current_time(new_inode); down_write(&F2FS_I(new_inode)->i_sem); if (old_dir_entry) f2fs_i_links_write(new_inode, false); @@ -774,7 +774,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, file_set_enc_name(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_inode->i_ctime = CURRENT_TIME; + old_inode->i_ctime = current_time(old_inode); f2fs_mark_inode_dirty_sync(old_inode); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 69c6bb9cf207..3a42405b6515 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -588,7 +588,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (is_inode_flag_set(inode, FI_ACL_MODE)) { inode->i_mode = F2FS_I(inode)->i_acl_mode; - inode->i_ctime = CURRENT_TIME; + inode->i_ctime = current_time(inode); clear_inode_flag(inode, FI_ACL_MODE); } if (index == F2FS_XATTR_INDEX_ENCRYPTION && From 57d5761d0b247ab4308ef01ca3d4629c71242d0e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 14 Oct 2016 11:51:23 -0700 Subject: [PATCH 426/546] f2fs: keep dirty inodes selectively for checkpoint commit 7c45729a4d6d1c90879e6c5c2df325c2f6db7191 upstream. This is to avoid no free segment bug during checkpoint caused by a number of dirty inodes. The case was reported by Chao like this. 1. mount with lazytime option 2. fill 4k file until disk is full 3. sync filesystem 4. read all files in the image 5. umount In this case, we actually don't need to flush dirty inode to inode page during checkpoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/acl.c | 2 +- fs/f2fs/dir.c | 6 +++--- fs/f2fs/extent_cache.c | 2 +- fs/f2fs/f2fs.h | 26 +++++++++++++------------- fs/f2fs/file.c | 9 +++++---- fs/f2fs/inline.c | 2 +- fs/f2fs/inode.c | 7 ++++--- fs/f2fs/namei.c | 6 +++--- fs/f2fs/super.c | 29 ++++++++++++++++------------- fs/f2fs/xattr.c | 4 ++-- 10 files changed, 49 insertions(+), 44 deletions(-) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 4a34040932e9..a45d1f4b7b0f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -387,7 +387,7 @@ int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, if (error) return error; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { error = __f2fs_set_acl(inode, ACL_TYPE_DEFAULT, default_acl, diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 3b8ebec0450b..5594667c2f41 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -313,7 +313,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, set_page_dirty(page); dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); f2fs_put_page(page, 1); } @@ -466,7 +466,7 @@ void update_parent_metadata(struct inode *dir, struct inode *inode, clear_inode_flag(inode, FI_NEW_INODE); } dir->i_mtime = dir->i_ctime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (F2FS_I(dir)->i_current_depth != current_depth) f2fs_i_depth_write(dir, current_depth); @@ -731,7 +731,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, set_page_dirty(page); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 2b06d4fcd954..4db44da7ef69 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -172,7 +172,7 @@ static void __drop_largest_extent(struct inode *inode, if (fofs < largest->fofs + largest->len && fofs + len > largest->fofs) { largest->len = 0; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1938fe457041..0d2502fdf892 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -541,13 +541,13 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *); +extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { if (en->ei.len > et->largest.len) { et->largest = en->ei; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1680,7 +1680,7 @@ static inline void __mark_inode_dirty_flag(struct inode *inode, return; case FI_DATA_EXIST: case FI_INLINE_DOTS: - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } } @@ -1707,7 +1707,7 @@ static inline void set_acl_inode(struct inode *inode, umode_t mode) { F2FS_I(inode)->i_acl_mode = mode; set_inode_flag(inode, FI_ACL_MODE); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static inline void f2fs_i_links_write(struct inode *inode, bool inc) @@ -1716,7 +1716,7 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) inc_nlink(inode); else drop_nlink(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_blocks_write(struct inode *inode, @@ -1727,7 +1727,7 @@ static inline void f2fs_i_blocks_write(struct inode *inode, inode->i_blocks = add ? inode->i_blocks + diff : inode->i_blocks - diff; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1741,7 +1741,7 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) return; i_size_write(inode, i_size); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); } @@ -1756,19 +1756,19 @@ static inline bool f2fs_skip_inode_update(struct inode *inode) static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_xnid_write(struct inode *inode, nid_t xnid) { F2FS_I(inode)->i_xattr_nid = xnid; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void f2fs_i_pino_write(struct inode *inode, nid_t pino) { F2FS_I(inode)->i_pino = pino; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) @@ -1896,13 +1896,13 @@ static inline int is_file(struct inode *inode, int type) static inline void set_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise |= type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline void clear_file(struct inode *inode, int type) { F2FS_I(inode)->i_advise &= ~type; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); } static inline int f2fs_readonly(struct super_block *sb) @@ -2054,7 +2054,7 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *); +int f2fs_inode_dirtied(struct inode *, bool); void f2fs_inode_synced(struct inode *); int f2fs_commit_super(struct f2fs_sb_info *, bool); int f2fs_sync_fs(struct super_block *, int); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 04a3205d2934..ce38a350fb38 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -265,7 +265,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } if (need_inode_block_update(sbi, ino)) { - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); f2fs_write_inode(inode, NULL); goto sync_nodes; } @@ -633,7 +633,7 @@ int f2fs_truncate(struct inode *inode) return err; inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); return 0; } @@ -722,7 +722,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - f2fs_mark_inode_dirty_sync(inode); + /* update attributes only */ + f2fs_mark_inode_dirty_sync(inode, false); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); @@ -1403,7 +1404,7 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3b2f88788e02..b859d5e377ae 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -592,7 +592,7 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_put_page(page, 1); dir->i_ctime = dir->i_mtime = current_time(dir); - f2fs_mark_inode_dirty_sync(dir); + f2fs_mark_inode_dirty_sync(dir, false); if (inode) f2fs_drop_nlink(dir, inode); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index d32fd0343eae..bfa512dde4ab 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -19,10 +19,11 @@ #include -void f2fs_mark_inode_dirty_sync(struct inode *inode) +void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) { - if (f2fs_inode_dirtied(inode)) + if (f2fs_inode_dirtied(inode, sync)) return; + mark_inode_dirty_sync(inode); } @@ -43,7 +44,7 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ae29726afff0..7f2fdb154180 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -775,7 +775,7 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); - f2fs_mark_inode_dirty_sync(old_inode); + f2fs_mark_inode_dirty_sync(old_inode, false); f2fs_delete_entry(old_entry, old_page, old_dir, NULL); @@ -935,7 +935,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(old_dir, old_nlink > 0); up_write(&F2FS_I(old_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(old_dir); + f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ f2fs_set_link(new_dir, new_entry, new_page, old_inode); @@ -950,7 +950,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_i_links_write(new_dir, new_nlink > 0); up_write(&F2FS_I(new_dir)->i_sem); } - f2fs_mark_inode_dirty_sync(new_dir); + f2fs_mark_inode_dirty_sync(new_dir, false); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 006138a6c5ab..23190a94840b 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -620,24 +620,25 @@ static int f2fs_drop_inode(struct inode *inode) return generic_drop_inode(inode); } -int f2fs_inode_dirtied(struct inode *inode) +int f2fs_inode_dirtied(struct inode *inode, bool sync) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + int ret = 0; spin_lock(&sbi->inode_lock[DIRTY_META]); if (is_inode_flag_set(inode, FI_DIRTY_INODE)) { - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return 1; + ret = 1; + } else { + set_inode_flag(inode, FI_DIRTY_INODE); + stat_inc_dirty_inode(sbi, DIRTY_META); } - - set_inode_flag(inode, FI_DIRTY_INODE); - list_add_tail(&F2FS_I(inode)->gdirty_list, + if (sync && list_empty(&F2FS_I(inode)->gdirty_list)) { + list_add_tail(&F2FS_I(inode)->gdirty_list, &sbi->inode_list[DIRTY_META]); - inc_page_count(sbi, F2FS_DIRTY_IMETA); - stat_inc_dirty_inode(sbi, DIRTY_META); + inc_page_count(sbi, F2FS_DIRTY_IMETA); + } spin_unlock(&sbi->inode_lock[DIRTY_META]); - - return 0; + return ret; } void f2fs_inode_synced(struct inode *inode) @@ -649,10 +650,12 @@ void f2fs_inode_synced(struct inode *inode) spin_unlock(&sbi->inode_lock[DIRTY_META]); return; } - list_del_init(&F2FS_I(inode)->gdirty_list); + if (!list_empty(&F2FS_I(inode)->gdirty_list)) { + list_del_init(&F2FS_I(inode)->gdirty_list); + dec_page_count(sbi, F2FS_DIRTY_IMETA); + } clear_inode_flag(inode, FI_DIRTY_INODE); clear_inode_flag(inode, FI_AUTO_RECOVER); - dec_page_count(sbi, F2FS_DIRTY_IMETA); stat_dec_dirty_inode(F2FS_I_SB(inode), DIRTY_META); spin_unlock(&sbi->inode_lock[DIRTY_META]); } @@ -676,7 +679,7 @@ static void f2fs_dirty_inode(struct inode *inode, int flags) if (is_inode_flag_set(inode, FI_AUTO_RECOVER)) clear_inode_flag(inode, FI_AUTO_RECOVER); - f2fs_inode_dirtied(inode); + f2fs_inode_dirtied(inode, false); } static void f2fs_i_callback(struct rcu_head *head) diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3a42405b6515..1c4d5e39586c 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -151,7 +151,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, return -EINVAL; F2FS_I(inode)->i_advise |= *(char *)value; - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); return 0; } @@ -594,7 +594,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (index == F2FS_XATTR_INDEX_ENCRYPTION && !strcmp(name, F2FS_XATTR_NAME_ENCRYPTION_CONTEXT)) f2fs_set_encrypted_inode(inode); - f2fs_mark_inode_dirty_sync(inode); + f2fs_mark_inode_dirty_sync(inode, true); if (!error && S_ISDIR(inode->i_mode)) set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_CP); exit: From 4227aabd6bfd51915a400e25be97589180e358be Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 19 Oct 2016 18:27:56 -0700 Subject: [PATCH 427/546] f2fs: make clean inodes when flushing inode page commit 18340edc8da20b0d399eb25ba4bb631b27652f46 upstream. This patch tries to make more clean inodes when flushing dirty inodes in checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 6 +++++- fs/f2fs/inode.c | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 1dffe86651be..ed79757c36e0 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -924,7 +924,11 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); if (inode) { - update_inode_page(inode); + sync_inode_metadata(inode, 0); + + /* it's on eviction */ + if (is_inode_flag_set(inode, FI_DIRTY_INODE)) + update_inode_page(inode); iput(inode); } }; diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index bfa512dde4ab..7b5e402f0a72 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -339,7 +339,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode)) + if (update_inode_page(inode) && wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } From 2d68ee060c6f95a4f9c44d0d58dcc7ab189b08fa Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 20 Oct 2016 19:09:57 -0700 Subject: [PATCH 428/546] f2fs: remove percpu_count due to performance regression commit 35782b233f37e48ecc469d9c7232f3f6a7fad41a upstream. This patch removes percpu_count usage due to performance regression in iozone. Fixes: 523be8a6b3 ("f2fs: use percpu_counter for page counters") Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 12 ++++++------ fs/f2fs/f2fs.h | 12 ++++++------ fs/f2fs/super.c | 16 +++++----------- 3 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 5cab80dfa4dc..678733c78f55 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -313,17 +313,17 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4lld, wb_bios: %4d\n", + seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", si->inmem_pages, si->wb_bios); - seq_printf(s, " - nodes: %4lld in %4d\n", + seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); - seq_printf(s, " - dents: %4lld in dirs:%4d (%4d)\n", + seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", si->ndirty_dent, si->ndirty_dirs, si->ndirty_all); - seq_printf(s, " - datas: %4lld in files:%4d\n", + seq_printf(s, " - datas: %4d in files:%4d\n", si->ndirty_data, si->ndirty_files); - seq_printf(s, " - meta: %4lld in %4d\n", + seq_printf(s, " - meta: %4d in %4d\n", si->ndirty_meta, si->meta_pages); - seq_printf(s, " - imeta: %4lld\n", + seq_printf(s, " - imeta: %4d\n", si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 0d2502fdf892..932c53f441db 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -872,7 +872,7 @@ struct f2fs_sb_info { atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ - struct percpu_counter nr_pages[NR_COUNT_TYPE]; + atomic_t nr_pages[NR_COUNT_TYPE]; /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; @@ -1286,7 +1286,7 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_inc(&sbi->nr_pages[count_type]); + atomic_inc(&sbi->nr_pages[count_type]); if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) return; @@ -1303,7 +1303,7 @@ static inline void inode_inc_dirty_pages(struct inode *inode) static inline void dec_page_count(struct f2fs_sb_info *sbi, int count_type) { - percpu_counter_dec(&sbi->nr_pages[count_type]); + atomic_dec(&sbi->nr_pages[count_type]); } static inline void inode_dec_dirty_pages(struct inode *inode) @@ -1319,7 +1319,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) { - return percpu_counter_sum_positive(&sbi->nr_pages[count_type]); + return atomic_read(&sbi->nr_pages[count_type]); } static inline s64 get_dirty_pages(struct inode *inode) @@ -2239,8 +2239,8 @@ struct f2fs_stat_info { unsigned long long hit_largest, hit_cached, hit_rbtree; unsigned long long hit_total, total_ext; int ext_tree, zombie_tree, ext_node; - s64 ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; - s64 inmem_pages; + int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; + int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 23190a94840b..6034d51fc5fc 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -696,10 +696,6 @@ static void f2fs_destroy_inode(struct inode *inode) static void destroy_percpu_info(struct f2fs_sb_info *sbi) { - int i; - - for (i = 0; i < NR_COUNT_TYPE; i++) - percpu_counter_destroy(&sbi->nr_pages[i]); percpu_counter_destroy(&sbi->alloc_valid_block_count); percpu_counter_destroy(&sbi->total_valid_inode_count); } @@ -1450,6 +1446,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; + int i; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1474,6 +1471,9 @@ static void init_sb_info(struct f2fs_sb_info *sbi) sbi->interval_time[REQ_TIME] = DEF_IDLE_INTERVAL; clear_sbi_flag(sbi, SBI_NEED_FSCK); + for (i = 0; i < NR_COUNT_TYPE; i++) + atomic_set(&sbi->nr_pages[i], 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); mutex_init(&sbi->wio_mutex[NODE]); @@ -1489,13 +1489,7 @@ static void init_sb_info(struct f2fs_sb_info *sbi) static int init_percpu_info(struct f2fs_sb_info *sbi) { - int i, err; - - for (i = 0; i < NR_COUNT_TYPE; i++) { - err = percpu_counter_init(&sbi->nr_pages[i], 0, GFP_KERNEL); - if (err) - return err; - } + int err; err = percpu_counter_init(&sbi->alloc_valid_block_count, 0, GFP_KERNEL); if (err) From ba169123208a5fecf27054200c7b0d9813335a5d Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Wed, 2 Nov 2016 14:52:15 +0100 Subject: [PATCH 429/546] f2fs: hide a maybe-uninitialized warning MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit 230436b3ef3fd7d4a1da19edf5e87bb2d74e0fc2 upstream. gcc is unsure about the use of last_ofs_in_node, which might happen without a prior initialization: fs/f2fs//git/arm-soc/fs/f2fs/data.c: In function ‘f2fs_map_blocks’: fs/f2fs/data.c:799:54: warning: ‘last_ofs_in_node’ may be used uninitialized in this function [-Wmaybe-uninitialized] if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) { As pointed out by Chao Yu, the code is actually correct as 'prealloc' is only set if the last_ofs_in_node has been set, the two always get updated together. This initializes last_ofs_in_node to dn.ofs_in_node for each new dnode at the start of the 'next_block' loop, which at that point is a correct initialization as well. I assume that compilers that correctly track the contents of the variables and do not warn about the condition also figure out that they can eliminate the extra assignment here. Fixes: 46008c6d4232 ("f2fs: support in batch multi blocks preallocation") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index e92cf046d9ff..f03c34d3797f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -708,7 +708,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } prealloc = 0; - ofs_in_node = dn.ofs_in_node; + last_ofs_in_node = ofs_in_node = dn.ofs_in_node; end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: From c3b06cc324617c33e215502d0c6715f0183b6c32 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 1 Dec 2016 10:44:44 -0800 Subject: [PATCH 430/546] fs/crypto: catch up 4.9-rc6 commit d117b9acaeada0b243f31e0fe83e111fcc9a6644 upstream. Merge tag 'ext4_for_stable' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 fixes from Ted Ts'o: "A security fix (so a maliciously corrupted file system image won't panic the kernel) and some fixes for CONFIG_VMAP_STACK" Signed-off-by: Jaegeuk Kim --- fs/crypto/crypto.c | 26 ++++---- fs/crypto/fname.c | 132 ++++++++++++++++++--------------------- fs/crypto/keyinfo.c | 85 ++++++++++++++++--------- fs/crypto/policy.c | 4 ++ fs/f2fs/dir.c | 6 +- fs/f2fs/namei.c | 6 +- include/linux/fscrypto.h | 24 ------- 7 files changed, 141 insertions(+), 142 deletions(-) diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2fc8c43ce531..2d40ab9edc9f 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -28,7 +28,6 @@ #include #include #include -#include static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -128,11 +127,11 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) EXPORT_SYMBOL(fscrypt_get_ctx); /** - * fscrypt_complete() - The completion callback for page encryption - * @req: The asynchronous encryption request context - * @res: The result of the encryption operation + * page_crypt_complete() - completion callback for page crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void fscrypt_complete(struct crypto_async_request *req, int res) +static void page_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -152,7 +151,10 @@ static int do_page_crypto(struct inode *inode, struct page *src_page, struct page *dest_page, gfp_t gfp_flags) { - u8 xts_tweak[FS_XTS_TWEAK_SIZE]; + struct { + __le64 index; + u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; + } xts_tweak; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -170,19 +172,17 @@ static int do_page_crypto(struct inode *inode, skcipher_request_set_callback( req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - fscrypt_complete, &ecr); + page_crypt_complete, &ecr); - BUILD_BUG_ON(FS_XTS_TWEAK_SIZE < sizeof(index)); - memcpy(xts_tweak, &index, sizeof(index)); - memset(&xts_tweak[sizeof(index)], 0, - FS_XTS_TWEAK_SIZE - sizeof(index)); + BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); + xts_tweak.index = cpu_to_le64(index); + memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); sg_init_table(&dst, 1); sg_set_page(&dst, dest_page, PAGE_SIZE, 0); sg_init_table(&src, 1); sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, - xts_tweak); + skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 5d6d49113efa..9b774f4b50c8 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -10,21 +10,16 @@ * This has not yet undergone a rigorous security audit. */ -#include -#include #include #include #include -static u32 size_round_up(size_t size, size_t blksize) -{ - return ((size + blksize - 1) / blksize) * blksize; -} - /** - * dir_crypt_complete() - + * fname_crypt_complete() - completion callback for filename crypto + * @req: The asynchronous cipher request context + * @res: The result of the cipher operation */ -static void dir_crypt_complete(struct crypto_async_request *req, int res) +static void fname_crypt_complete(struct crypto_async_request *req, int res) { struct fscrypt_completion_result *ecr = req->data; @@ -35,90 +30,80 @@ static void dir_crypt_complete(struct crypto_async_request *req, int res) } /** - * fname_encrypt() - + * fname_encrypt() - encrypt a filename * - * This function encrypts the input filename, and returns the length of the - * ciphertext. Errors are returned as negative numbers. We trust the caller to - * allocate sufficient memory to oname string. + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_encrypt(struct inode *inode, const struct qstr *iname, struct fscrypt_str *oname) { - u32 ciphertext_len; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct fscrypt_info *ci = inode->i_crypt_info; struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; char iv[FS_CRYPTO_BLOCK_SIZE]; - struct scatterlist src_sg, dst_sg; + struct scatterlist sg; int padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - char *workbuf, buf[32], *alloc_buf = NULL; - unsigned lim; + unsigned int lim; + unsigned int cryptlen; lim = inode->i_sb->s_cop->max_namelen(inode); if (iname->len <= 0 || iname->len > lim) return -EIO; - ciphertext_len = (iname->len < FS_CRYPTO_BLOCK_SIZE) ? - FS_CRYPTO_BLOCK_SIZE : iname->len; - ciphertext_len = size_round_up(ciphertext_len, padding); - ciphertext_len = (ciphertext_len > lim) ? lim : ciphertext_len; + /* + * Copy the filename to the output buffer for encrypting in-place and + * pad it with the needed number of NUL bytes. + */ + cryptlen = max_t(unsigned int, iname->len, FS_CRYPTO_BLOCK_SIZE); + cryptlen = round_up(cryptlen, padding); + cryptlen = min(cryptlen, lim); + memcpy(oname->name, iname->name, iname->len); + memset(oname->name + iname->len, 0, cryptlen - iname->len); - if (ciphertext_len <= sizeof(buf)) { - workbuf = buf; - } else { - alloc_buf = kmalloc(ciphertext_len, GFP_NOFS); - if (!alloc_buf) - return -ENOMEM; - workbuf = alloc_buf; - } + /* Initialize the IV */ + memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - /* Allocate request */ + /* Set up the encryption request */ req = skcipher_request_alloc(tfm, GFP_NOFS); if (!req) { printk_ratelimited(KERN_ERR - "%s: crypto_request_alloc() failed\n", __func__); - kfree(alloc_buf); + "%s: skcipher_request_alloc() failed\n", __func__); return -ENOMEM; } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); + sg_init_one(&sg, oname->name, cryptlen); + skcipher_request_set_crypt(req, &sg, &sg, cryptlen, iv); - /* Copy the input */ - memcpy(workbuf, iname->name, iname->len); - if (iname->len < ciphertext_len) - memset(workbuf + iname->len, 0, ciphertext_len - iname->len); - - /* Initialize IV */ - memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); - - /* Create encryption request */ - sg_init_one(&src_sg, workbuf, ciphertext_len); - sg_init_one(&dst_sg, oname->name, ciphertext_len); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, ciphertext_len, iv); + /* Do the encryption */ res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { + /* Request is being completed asynchronously; wait for it */ wait_for_completion(&ecr.completion); res = ecr.res; } - kfree(alloc_buf); skcipher_request_free(req); - if (res < 0) + if (res < 0) { printk_ratelimited(KERN_ERR "%s: Error (error code %d)\n", __func__, res); + return res; + } - oname->len = ciphertext_len; - return res; + oname->len = cryptlen; + return 0; } -/* - * fname_decrypt() - * This function decrypts the input filename, and returns - * the length of the plaintext. - * Errors are returned as negative numbers. - * We trust the caller to allocate sufficient memory to oname string. +/** + * fname_decrypt() - decrypt a filename + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ static int fname_decrypt(struct inode *inode, const struct fscrypt_str *iname, @@ -146,7 +131,7 @@ static int fname_decrypt(struct inode *inode, } skcipher_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - dir_crypt_complete, &ecr); + fname_crypt_complete, &ecr); /* Initialize IV */ memset(iv, 0, FS_CRYPTO_BLOCK_SIZE); @@ -168,7 +153,7 @@ static int fname_decrypt(struct inode *inode, } oname->len = strnlen(oname->name, iname->len); - return oname->len; + return 0; } static const char *lookup_table = @@ -231,9 +216,8 @@ u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) if (ci) padding = 4 << (ci->ci_flags & FS_POLICY_FLAGS_PAD_MASK); - if (ilen < FS_CRYPTO_BLOCK_SIZE) - ilen = FS_CRYPTO_BLOCK_SIZE; - return size_round_up(ilen, padding); + ilen = max(ilen, (u32)FS_CRYPTO_BLOCK_SIZE); + return round_up(ilen, padding); } EXPORT_SYMBOL(fscrypt_fname_encrypted_size); @@ -279,6 +263,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); /** * fscrypt_fname_disk_to_usr() - converts a filename from disk space to user * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, u32 hash, u32 minor_hash, @@ -287,13 +275,12 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, { const struct qstr qname = FSTR_TO_QSTR(iname); char buf[24]; - int ret; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (iname->len < FS_CRYPTO_BLOCK_SIZE) @@ -303,9 +290,9 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, return fname_decrypt(inode, iname, oname); if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { - ret = digest_encode(iname->name, iname->len, oname->name); - oname->len = ret; - return ret; + oname->len = digest_encode(iname->name, iname->len, + oname->name); + return 0; } if (hash) { memcpy(buf, &hash, 4); @@ -315,15 +302,18 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, } memcpy(buf + 8, iname->name + iname->len - 16, 16); oname->name[0] = '_'; - ret = digest_encode(buf, 24, oname->name + 1); - oname->len = ret + 1; - return ret + 1; + oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); /** * fscrypt_fname_usr_to_disk() - converts a filename from user space to disk * space + * + * The caller must have allocated sufficient memory for the @oname string. + * + * Return: 0 on success, -errno on failure */ int fscrypt_fname_usr_to_disk(struct inode *inode, const struct qstr *iname, @@ -333,7 +323,7 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, oname->name[0] = '.'; oname->name[iname->len - 1] = '.'; oname->len = iname->len; - return oname->len; + return 0; } if (inode->i_crypt_info) return fname_encrypt(inode, iname, oname); @@ -367,10 +357,10 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, if (dir->i_crypt_info) { ret = fscrypt_fname_alloc_buffer(dir, iname->len, &fname->crypto_buf); - if (ret < 0) + if (ret) return ret; ret = fname_encrypt(dir, iname, &fname->crypto_buf); - if (ret < 0) + if (ret) goto errout; fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 1ac263eddc4e..67fb6d8876d0 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -8,11 +8,8 @@ * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. */ -#include #include -#include #include -#include #include static void derive_crypt_complete(struct crypto_async_request *req, int rc) @@ -139,6 +136,38 @@ static int validate_user_key(struct fscrypt_info *crypt_info, return res; } +static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, + const char **cipher_str_ret, int *keysize_ret) +{ + if (S_ISREG(inode->i_mode)) { + if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { + *cipher_str_ret = "xts(aes)"; + *keysize_ret = FS_AES_256_XTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported contents encryption mode " + "%d for inode %lu\n", + ci->ci_data_mode, inode->i_ino); + return -ENOKEY; + } + + if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { + *cipher_str_ret = "cts(cbc(aes))"; + *keysize_ret = FS_AES_256_CTS_KEY_SIZE; + return 0; + } + pr_warn_once("fscrypto: unsupported filenames encryption mode " + "%d for inode %lu\n", + ci->ci_filename_mode, inode->i_ino); + return -ENOKEY; + } + + pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", + (inode->i_mode & S_IFMT), inode->i_ino); + return -ENOKEY; +} + static void put_crypt_info(struct fscrypt_info *ci) { if (!ci) @@ -155,8 +184,8 @@ int get_crypt_info(struct inode *inode) struct fscrypt_context ctx; struct crypto_skcipher *ctfm; const char *cipher_str; - u8 raw_key[FS_MAX_KEY_SIZE]; - u8 mode; + int keysize; + u8 *raw_key = NULL; int res; res = fscrypt_initialize(); @@ -179,13 +208,19 @@ int get_crypt_info(struct inode *inode) if (res < 0) { if (!fscrypt_dummy_context_enabled(inode)) return res; + ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; ctx.flags = 0; } else if (res != sizeof(ctx)) { return -EINVAL; } - res = 0; + + if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) + return -EINVAL; + + if (ctx.flags & ~FS_POLICY_FLAGS_VALID) + return -EINVAL; crypt_info = kmem_cache_alloc(fscrypt_info_cachep, GFP_NOFS); if (!crypt_info) @@ -198,27 +233,20 @@ int get_crypt_info(struct inode *inode) crypt_info->ci_keyring_key = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "%s: unsupported key mode %d (ino %u)\n", - __func__, mode, (unsigned) inode->i_ino); - res = -ENOKEY; + res = determine_cipher_type(crypt_info, inode, &cipher_str, &keysize); + if (res) goto out; - } + + /* + * This cannot be a stack buffer because it is passed to the scatterlist + * crypto API as part of key derivation. + */ + res = -ENOMEM; + raw_key = kmalloc(FS_MAX_KEY_SIZE, GFP_NOFS); + if (!raw_key) + goto out; + if (fscrypt_dummy_context_enabled(inode)) { memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); goto got_key; @@ -253,11 +281,12 @@ int get_crypt_info(struct inode *inode) crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_skcipher_setkey(ctfm, raw_key, fscrypt_key_size(mode)); + res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); + raw_key = NULL; if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { put_crypt_info(crypt_info); goto retry; @@ -268,7 +297,7 @@ int get_crypt_info(struct inode *inode) if (res == -ENOKEY) res = 0; put_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); + kzfree(raw_key); return res; } diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index ed115acb5dee..6865663aac69 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -109,6 +109,8 @@ int fscrypt_process_policy(struct file *filp, if (ret) return ret; + inode_lock(inode); + if (!inode_has_encryption_context(inode)) { if (!S_ISDIR(inode->i_mode)) ret = -EINVAL; @@ -127,6 +129,8 @@ int fscrypt_process_policy(struct file *filp, ret = -EINVAL; } + inode_unlock(inode); + mnt_drop_write_file(filp); return ret; } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5594667c2f41..210082783d5a 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -814,12 +814,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (f2fs_encrypted_inode(d->inode)) { int save_len = fstr->len; - int ret; + int err; - ret = fscrypt_fname_disk_to_usr(d->inode, + err = fscrypt_fname_disk_to_usr(d->inode, (u32)de->hash_code, 0, &de_name, fstr); - if (ret < 0) + if (err) return true; de_name = *fstr; diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 7f2fdb154180..468b2dbe6d34 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -451,7 +451,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, ostr.name = sd->encrypted_path; ostr.len = disk_link.len; err = fscrypt_fname_usr_to_disk(inode, &istr, &ostr); - if (err < 0) + if (err) goto err_out; sd->len = cpu_to_le16(ostr.len); @@ -1047,7 +1047,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook goto errout; res = fscrypt_fname_disk_to_usr(inode, 0, 0, &cstr, &pstr); - if (res < 0) + if (res) goto errout; /* this is broken symlink case */ @@ -1059,7 +1059,7 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook paddr = pstr.name; /* Null-terminate the name */ - paddr[res] = '\0'; + paddr[pstr.len] = '\0'; put_page(cpage); return *cookie = paddr; diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index 76cff18bb032..ff8b11b26f31 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -111,23 +111,6 @@ struct fscrypt_completion_result { struct fscrypt_completion_result ecr = { \ COMPLETION_INITIALIZER((ecr).completion), 0 } -static inline int fscrypt_key_size(int mode) -{ - switch (mode) { - case FS_ENCRYPTION_MODE_AES_256_XTS: - return FS_AES_256_XTS_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_GCM: - return FS_AES_256_GCM_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CBC: - return FS_AES_256_CBC_KEY_SIZE; - case FS_ENCRYPTION_MODE_AES_256_CTS: - return FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - #define FS_FNAME_NUM_SCATTER_ENTRIES 4 #define FS_CRYPTO_BLOCK_SIZE 16 #define FS_FNAME_CRYPTO_DIGEST_SIZE 32 @@ -202,13 +185,6 @@ static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); } -static inline u32 fscrypt_validate_encryption_key_size(u32 mode, u32 size) -{ - if (size == fscrypt_key_size(mode)) - return size; - return 0; -} - static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) { if (str->len == 1 && str->name[0] == '.') From 5fee1505086b7686fe9d2a47186237976b34dd36 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 29 Oct 2016 18:46:34 +0800 Subject: [PATCH 431/546] f2fs: report error of f2fs_fill_dentries commit ed6bd4b146527e7c6934e3582c47d7b857802676 upstream. Report error of f2fs_fill_dentries to ->iterate_shared, otherwise when error ocurrs, user may just list part of dirents in target directory without any hints. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/dir.c | 21 ++++++++++++--------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/inline.c | 6 ++++-- 3 files changed, 17 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 210082783d5a..4436079dbf0c 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -785,7 +785,7 @@ bool f2fs_empty_dir(struct inode *dir) return true; } -bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr) { unsigned char d_type = DT_UNKNOWN; @@ -820,7 +820,7 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, (u32)de->hash_code, 0, &de_name, fstr); if (err) - return true; + return err; de_name = *fstr; fstr->len = save_len; @@ -828,12 +828,12 @@ bool f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, if (!dir_emit(ctx, de_name.name, de_name.len, le32_to_cpu(de->ino), d_type)) - return true; + return 1; bit_pos += GET_DENTRY_SLOTS(le16_to_cpu(de->name_len)); ctx->pos = start_pos + bit_pos; } - return false; + return 0; } static int f2fs_readdir(struct file *file, struct dir_context *ctx) @@ -872,17 +872,21 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_page = get_lock_data_page(inode, n, false); if (IS_ERR(dentry_page)) { err = PTR_ERR(dentry_page); - if (err == -ENOENT) + if (err == -ENOENT) { + err = 0; continue; - else + } else { goto out; + } } dentry_blk = kmap(dentry_page); make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); - if (f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr)) { + err = f2fs_fill_dentries(ctx, &d, + n * NR_DENTRY_IN_BLOCK, &fstr); + if (err) { kunmap(dentry_page); f2fs_put_page(dentry_page, 1); break; @@ -892,10 +896,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) kunmap(dentry_page); f2fs_put_page(dentry_page, 1); } - err = 0; out: fscrypt_fname_free_buffer(&fstr); - return err; + return err < 0 ? err : 0; } static int f2fs_dir_open(struct inode *inode, struct file *filp) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 932c53f441db..4b13d70d716c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2014,7 +2014,7 @@ void set_de_type(struct f2fs_dir_entry *, umode_t); unsigned char get_de_type(struct f2fs_dir_entry *); struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -bool f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, +int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, unsigned int, struct fscrypt_str *); void do_make_empty_dir(struct inode *, struct inode *, struct f2fs_dentry_ptr *); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b859d5e377ae..b85987703d1e 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -629,6 +629,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + int err; if (ctx->pos == NR_INLINE_DENTRY) return 0; @@ -641,11 +642,12 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); - if (!f2fs_fill_dentries(ctx, &d, 0, fstr)) + err = f2fs_fill_dentries(ctx, &d, 0, fstr); + if (!err) ctx->pos = NR_INLINE_DENTRY; f2fs_put_page(ipage, 1); - return 0; + return err < 0 ? err : 0; } int f2fs_inline_data_fiemap(struct inode *inode, From f0e97e17a9413f3f6e2007fe4f13dc1d46be51cc Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 31 Oct 2016 14:01:41 -0700 Subject: [PATCH 432/546] f2fs: avoid infinite loop in the EIO case on recover_orphan_inodes commit 099228000eff6b25e0f76b276043cd65cd4eba5a upstream. This patch should fix an infinite loop case below. F2FS-fs : inject IO error in f2fs_read_end_io+0xf3/0x120 [f2fs] F2FS-fs (nvme0n1p1): recover_orphan_inode: orphan failed (ino=39ac1a), run fsck to fix. ... [] sync_meta_pages+0xae/0x270 [f2fs] [] ? flush_sit_entries+0x8d/0x960 [f2fs] [] write_checkpoint+0x361/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] f2fs_balance_fs_bg+0x7e/0x1c0 [f2fs] [] f2fs_write_node_pages+0x34/0x320 [f2fs] [] do_writepages+0x21/0x30 [] __writeback_single_inode+0x61/0x760 [] ? _raw_spin_unlock+0x27/0x40 [] writeback_single_inode+0xd5/0x190 [] write_inode_now+0x99/0xc0 [] iput+0x1f6/0x2c0 [] f2fs_fill_super+0xe0e/0x1300 [f2fs] [] ? sget_userns+0x4f4/0x530 [] mount_bdev+0x182/0x1b0 [] ? f2fs_commit_super+0x100/0x100 [f2fs] [] f2fs_mount+0x15/0x20 [f2fs] [] mount_fs+0x38/0x170 [] vfs_kern_mount+0x6b/0x160 [] do_mount+0x1be/0xd60 [] ? copy_mount_options+0xb7/0x220 [] SyS_mount+0x94/0xd0 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6034d51fc5fc..e007c011ec53 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1890,6 +1890,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); free_nm: From cc2b01c544cbaeb09b45baa71d22f25b7ab6d605 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:44:59 +0900 Subject: [PATCH 433/546] f2fs: Add missing break in switch-case commit 487df616dec33231c99294b906d720d256a2de16 upstream. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e007c011ec53..4fd34e7bcf60 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -420,6 +420,7 @@ static int parse_options(struct super_block *sb, char *options) break; case Opt_nodiscard: clear_opt(sbi, DISCARD); + break; case Opt_noheap: set_opt(sbi, NOHEAP); break; From 2b5864b28ea41dd205c0b549927529860ccfcac3 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:00 +0900 Subject: [PATCH 434/546] f2fs: Use generic zoned block device terminology commit 0bfd7a091c19132489a0f977b8dbf9f6b5ae0a1c upstream. SMR stands for "Shingled Magnetic Recording" which makes sense only for hard disk drives (spinning rust). The ZBC/ZAC standards enable management of SMR disks, but solid state drives may also support those standards. So rename the HMSMR feature to BLKZONED to avoid a HDD centric terminology. For the same reason, rename f2fs_sb_mounted_hmsmr to f2fs_sb_mounted_blkzoned. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f03c34d3797f..a60efb8fdd3a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -111,7 +111,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, { if (!is_read_io(rw)) { atomic_inc(&sbi->nr_wb_bios); - if (f2fs_sb_mounted_hmsmr(sbi->sb) && + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4b13d70d716c..5a563a9f3a52 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -103,7 +103,7 @@ struct f2fs_mount_info { }; #define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_HMSMR 0x0002 +#define F2FS_FEATURE_BLKZONED 0x0002 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) @@ -2470,9 +2470,9 @@ static inline int f2fs_sb_has_crypto(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_ENCRYPT); } -static inline int f2fs_sb_mounted_hmsmr(struct super_block *sb) +static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) { - return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_HMSMR); + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4fd34e7bcf60..3574d0620dc4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -974,7 +974,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); - if (f2fs_sb_mounted_hmsmr(sbi->sb)) { + if (f2fs_sb_mounted_blkzoned(sbi->sb)) { set_opt_mode(sbi, F2FS_MOUNT_LFS); set_opt(sbi, DISCARD); } else { From 0c0d7e6e2e467716eef7a266f2ff87337744fc26 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:01 +0900 Subject: [PATCH 435/546] f2fs: Check zoned block feature for host-managed zoned block devices commit d1b959c8770260b611b9a1f0c5e8b12b7cb5b9d2 upstream. The F2FS_FEATURE_BLKZONED feature indicates that the drive was formatted with zone alignment optimization. This is optional for host-aware devices, but mandatory for host-managed zoned block devices. So check that the feature is set in this latter case. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3574d0620dc4..4187e3b9a83e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1639,6 +1639,26 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* + * The BLKZONED feature indicates that the drive was formatted with + * zone alignment optimization. This is optional for host-aware + * devices, but mandatory for host-managed zoned block devices. + */ +#ifndef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device support is not enabled\n"); + goto free_sb_buf; + } +#else + if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + goto free_sb_buf; + } +#endif + default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); From b6fa8c2507f6fdac91da7bec0ed50c0c5720281f Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:02 +0900 Subject: [PATCH 436/546] f2fs: Suppress discard warning message for zoned block devices commit 0ab0299835738cd407569401da1fef4c97b4419c upstream. For zoned block devices, discard is replaced by zone reset. So do not warn if the device does not supports discard. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 4187e3b9a83e..3e57ec837de9 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -412,7 +412,7 @@ static int parse_options(struct super_block *sb, char *options) q = bdev_get_queue(sb->s_bdev); if (blk_queue_discard(q)) { set_opt(sbi, DISCARD); - } else { + } else if (!f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but " "the device does not support discard"); From dae3ed51548e96759db90694fcf8d74dbe0d810e Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:03 +0900 Subject: [PATCH 437/546] f2fs: Always enable discard for zoned blocks devices commit 96ba2decb4241aa2c6b61cfc8489d648769eff99 upstream. Zone write pointer reset acts as discard for zoned block devices. So if the zoned block device feature is enabled, always declare that discard is enabled, even if the device does not actually support the command. For the same reason, prevent the use the "nodicard" mount option. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 14 +++++++------- fs/f2fs/super.c | 5 +++++ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 5a563a9f3a52..4fd31208965d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1159,13 +1159,6 @@ static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) spin_unlock(&sbi->cp_lock); } -static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) -{ - struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); - - return blk_queue_discard(q); -} - static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { down_read(&sbi->cp_rwsem); @@ -2475,6 +2468,13 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) +{ + struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); + + return blk_queue_discard(q) || f2fs_sb_mounted_blkzoned(sbi->sb); +} + static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt) { clear_opt(sbi, ADAPTIVE); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3e57ec837de9..33676c1e35d7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -419,6 +419,11 @@ static int parse_options(struct super_block *sb, char *options) } break; case Opt_nodiscard: + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "discard is required for zoned block devices"); + return -EINVAL; + } clear_opt(sbi, DISCARD); break; case Opt_noheap: From bb17ca7bfd514f11d5038d7bdb7a1a30ca41b985 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:04 +0900 Subject: [PATCH 438/546] f2fs: Do not allow adaptive mode for host-managed zoned block devices commit 3adc57e97792e4ac9f228bde802829e2e9840afe upstream. The LFS mode is mandatory for host-managed zoned block devices as update in place optimizations are not possible for segments in sequential zones. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 33676c1e35d7..6bc0810969b7 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -518,6 +518,13 @@ static int parse_options(struct super_block *sb, char *options) return -ENOMEM; if (strlen(name) == 8 && !strncmp(name, "adaptive", 8)) { + if (f2fs_sb_mounted_blkzoned(sb)) { + f2fs_msg(sb, KERN_WARNING, + "adaptive mode is not allowed with " + "zoned block device feature"); + kfree(name); + return -EINVAL; + } set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE); } else if (strlen(name) == 3 && !strncmp(name, "lfs", 3)) { From 72f2351ba720567cd098434e38d73a3e31e060cc Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:05 +0900 Subject: [PATCH 439/546] f2fs: Cache zoned block devices zone type commit 178053e2f1f9ccdb61ff6c2bd8644b53fc98e72e upstream. With the zoned block device feature enabled, section discard need to do a zone reset for sections contained in sequential zones, and a regular discard (if supported) for sections stored in conventional zones. Avoid the need for a costly report zones to obtain a section zone type when discarding it by caching the types of the device zones in the super block information. This cache is initialized at mount time for mounts with the zoned block device feature enabled. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 18 +++++++++++++ fs/f2fs/super.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 90 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4fd31208965d..c6dba704b0fe 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -803,6 +803,14 @@ struct f2fs_sb_info { u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; u8 key_prefix_size; #endif + +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + unsigned int blocks_per_blkz; /* F2FS blocks per zone */ + unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ + u8 *blkz_type; /* Array of zones type */ +#endif + /* for node-related operations */ struct f2fs_nm_info *nm_info; /* node manager */ struct inode *node_inode; /* cache node blocks */ @@ -2468,6 +2476,16 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +#ifdef CONFIG_BLK_DEV_ZONED +static inline int get_blkz_type(struct f2fs_sb_info *sbi, + block_t blkaddr) +{ + unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + + return sbi->blkz_type[zno]; +} +#endif + static inline bool f2fs_discard_en(struct f2fs_sb_info *sbi) { struct request_queue *q = bdev_get_queue(sbi->sb->s_bdev); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 6bc0810969b7..d777a18df958 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1512,6 +1512,65 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) GFP_KERNEL); } +#ifdef CONFIG_BLK_DEV_ZONED +static int init_blkz_info(struct f2fs_sb_info *sbi) +{ + struct block_device *bdev = sbi->sb->s_bdev; + sector_t nr_sectors = bdev->bd_part->nr_sects; + sector_t sector = 0; + struct blk_zone *zones; + unsigned int i, nr_zones; + unsigned int n = 0; + int err = -EIO; + + if (!f2fs_sb_mounted_blkzoned(sbi->sb)) + return 0; + + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); + sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; + if (nr_sectors & (bdev_zone_size(bdev) - 1)) + sbi->nr_blkz++; + + sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); + if (!sbi->blkz_type) + return -ENOMEM; + +#define F2FS_REPORT_NR_ZONES 4096 + + zones = kcalloc(F2FS_REPORT_NR_ZONES, sizeof(struct blk_zone), + GFP_KERNEL); + if (!zones) + return -ENOMEM; + + /* Get block zones type */ + while (zones && sector < nr_sectors) { + + nr_zones = F2FS_REPORT_NR_ZONES; + err = blkdev_report_zones(bdev, sector, + zones, &nr_zones, + GFP_KERNEL); + if (err) + break; + if (!nr_zones) { + err = -EIO; + break; + } + + for (i = 0; i < nr_zones; i++) { + sbi->blkz_type[n] = zones[i].type; + sector += zones[i].len; + n++; + } + } + + kfree(zones); + + return err; +} +#endif + /* * Read f2fs raw super block. * Because we have two copies of super block, so read both of them @@ -1758,6 +1817,15 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_ino_entry_info(sbi); +#ifdef CONFIG_BLK_DEV_ZONED + err = init_blkz_info(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + goto free_blkz; + } +#endif + /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -1936,6 +2004,10 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); +#ifdef CONFIG_BLK_DEV_ZONED +free_blkz: + kfree(sbi->blkz_type); +#endif kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); From 5558039d0b3f92b889cff285296d48546bbd117c Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:06 +0900 Subject: [PATCH 440/546] f2fs: Reset sequential zones on zoned block devices commit f46e8809e88d113ba096bcfc772b5182ce00b941 upstream. When a zoned block device is mounted, discarding sections contained in sequential zones must reset the zone write pointer. For sections contained in conventional zones, the regular discard is used if the drive supports it. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 45 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 45 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ec4d74c26067..8e4863bd36f5 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,6 +16,7 @@ #include #include #include +#include #include "f2fs.h" #include "segment.h" @@ -584,6 +585,45 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +#ifdef CONFIG_BLK_DEV_ZONED +static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t sector = SECTOR_FROM_BLOCK(blkstart); + sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); + struct block_device *bdev = sbi->sb->s_bdev; + + if (nr_sects != bdev_zone_size(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "Unaligned discard attempted (sector %llu + %llu)", + (unsigned long long)sector, + (unsigned long long)nr_sects); + return -EIO; + } + + /* + * We need to know the type of the zone: for conventional zones, + * use regular discard if the drive supports it. For sequential + * zones, reset the zone write pointer. + */ + switch (get_blkz_type(sbi, blkstart)) { + + case BLK_ZONE_TYPE_CONVENTIONAL: + if (!blk_queue_discard(bdev_get_queue(bdev))) + return 0; + return blkdev_issue_discard(bdev, sector, nr_sects, + GFP_NOFS, 0); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + case BLK_ZONE_TYPE_SEQWRITE_PREF: + return blkdev_reset_zones(bdev, sector, + nr_sects, GFP_NOFS); + default: + /* Unknown zone type: broken device ? */ + return -EIO; + } +} +#endif + static int f2fs_issue_discard(struct f2fs_sb_info *sbi, block_t blkstart, block_t blklen) { @@ -601,6 +641,11 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, sbi->discard_blks--; } trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb)) + return f2fs_issue_discard_zone(sbi, blkstart, blklen); +#endif return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); } From 12974f3c15b09b9c63dbf96ecd68ab099db60c09 Mon Sep 17 00:00:00 2001 From: Damien Le Moal Date: Fri, 28 Oct 2016 17:45:07 +0900 Subject: [PATCH 441/546] f2fs: Trace reset zone events commit 126606c7a99b32ba8265a51fab01533fe40c9ecc upstream. Similarly to the regular discard, trace zone reset events. Signed-off-by: Damien Le Moal Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 + include/trace/events/f2fs.h | 21 +++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 8e4863bd36f5..06b9d16a19f6 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -615,6 +615,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, GFP_NOFS, 0); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: + trace_f2fs_issue_reset_zone(sbi->sb, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 3a09bb4dc3b2..90d6ad49a9c5 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -1110,6 +1110,27 @@ TRACE_EVENT(f2fs_issue_discard, (unsigned long long)__entry->blklen) ); +TRACE_EVENT(f2fs_issue_reset_zone, + + TP_PROTO(struct super_block *sb, block_t blkstart), + + TP_ARGS(sb, blkstart), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(block_t, blkstart) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->blkstart = blkstart; + ), + + TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", + show_dev(__entry), + (unsigned long long)__entry->blkstart) +); + TRACE_EVENT(f2fs_issue_flush, TP_PROTO(struct super_block *sb, unsigned int nobarrier, From f8cd7c2937bf9687f8bb04ad07412175e7de7de2 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 2 Nov 2016 20:43:21 +0800 Subject: [PATCH 442/546] f2fs: record inode updating status correctly commit 60dcedc9972d136fab1600c919dd32080bcc26f7 upstream. We should record updating status of inode only for living inode, for those unlinked inode it needs to clear its ino cache, otherwise after the ino was been reused, it will cause unneeded node page writing during ->fsync. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/inode.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 7b5e402f0a72..af06bda51a54 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -377,6 +377,9 @@ void f2fs_evict_inode(struct inode *inode) goto no_delete; #endif + remove_ino_entry(sbi, inode->i_ino, APPEND_INO); + remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); + sb_start_intwrite(inode->i_sb); set_inode_flag(inode, FI_NO_ALLOC); i_size_write(inode, 0); @@ -409,10 +412,12 @@ void f2fs_evict_inode(struct inode *inode) invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); - if (is_inode_flag_set(inode, FI_APPEND_WRITE)) - add_ino_entry(sbi, inode->i_ino, APPEND_INO); - if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) - add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + if (inode->i_nlink) { + if (is_inode_flag_set(inode, FI_APPEND_WRITE)) + add_ino_entry(sbi, inode->i_ino, APPEND_INO); + if (is_inode_flag_set(inode, FI_UPDATE_WRITE)) + add_ino_entry(sbi, inode->i_ino, UPDATE_INO); + } if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); From 72982ad484a70be18f79df9dd7e59669e93a88b9 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 4 Nov 2016 00:26:55 +0800 Subject: [PATCH 443/546] f2fs: fix wrong i_atime recovery commit 9f0552e078b8a25fcf9c3950a05ea1398616d2b9 upstream. Shouldn't update in-memory i_atime with on-disk i_mtime of inode when recovering inode. Shuoran found this bug which is hidden for a long time, honour is belong to him. Signed-off-by: Shuoran Liu Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index ba7372f7f95d..612a1e98625d 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -180,10 +180,10 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_mode = le16_to_cpu(raw->i_mode); f2fs_i_size_write(inode, le64_to_cpu(raw->i_size)); - inode->i_atime.tv_sec = le64_to_cpu(raw->i_mtime); + inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime); inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime); - inode->i_atime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec); inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); From c1c304e68e50e7f95d4486732db601f6377a844e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:33:57 -0700 Subject: [PATCH 444/546] f2fs: assign segments correctly for direct_io commit bdb7d964c4fb73078ab21c20e8c7b7bf7e641bb6 upstream. Previously, we assigned CURSEG_WARM_DATA for direct_io, but if we have two or four logs, we do not use that type at all. Let's fix it. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 06b9d16a19f6..4bdf1191a36f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1422,8 +1422,12 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct curseg_info *curseg; bool direct_io = (type == CURSEG_DIRECT_IO); - type = direct_io ? CURSEG_WARM_DATA : type; - + if (direct_io) { + if (sbi->active_logs <= 4) + type = CURSEG_HOT_DATA; + else + type = CURSEG_WARM_DATA; + } curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); From 1f74fc2b10e321e158d525cd44352f60a10eb557 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 4 Nov 2016 14:59:15 -0700 Subject: [PATCH 445/546] f2fs: remove checkpoint in f2fs_freeze commit b4b9d34c855ef383402cd1acefb1e33a515ae5f5 upstream. The generic freeze_super() calls sync_filesystems() before f2fs_freeze(). So, basically we don't need to do checkpoint in f2fs_freeze(). But, in xfs/068, it triggers circular locking problem below due to gc_mutex for checkpoint. ====================================================== [ INFO: possible circular locking dependency detected ] 4.9.0-rc1+ #132 Tainted: G OE ------------------------------------------------------- 1. wait for __sb_start_write() by [] dump_stack+0x85/0xc2 [] print_circular_bug+0x1cf/0x230 [] __lock_acquire+0x19e0/0x1bc0 [] lock_acquire+0x11b/0x220 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] __sb_start_write+0x130/0x200 [] ? f2fs_drop_inode+0x9b/0x160 [f2fs] [] f2fs_drop_inode+0x9b/0x160 [f2fs] [] iput+0x171/0x2c0 [] f2fs_sync_inode_meta+0x3f/0xf0 [f2fs] [] block_operations+0x84/0x110 [f2fs] [] write_checkpoint+0xe8/0xf20 [f2fs] [] ? trace_hardirqs_on+0xd/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] ? sched_clock+0x9/0x10 [] ? f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_sync_fs+0x85/0x190 [f2fs] [] ? do_fsync+0x70/0x70 [] ? do_fsync+0x70/0x70 [] sync_fs_one_sb+0x20/0x30 [] iterate_supers+0xae/0x100 [] sys_sync+0x55/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 2. wait for sbi->gc_mutex by [] lock_acquire+0x11b/0x220 [] mutex_lock_nested+0x76/0x3f0 [] f2fs_sync_fs+0x79/0x190 [f2fs] [] f2fs_freeze+0x1c/0x20 [f2fs] [] freeze_super+0xcf/0x190 [] do_vfs_ioctl+0x53c/0x6a0 [] SyS_ioctl+0x79/0x90 [] entry_SYSCALL_64_fastpath+0x23/0xc6 Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index d777a18df958..7a0634b0bee8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -800,13 +800,17 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { - int err; - if (f2fs_readonly(sb)) return 0; - err = f2fs_sync_fs(sb, 1); - return err; + /* IO error happened before */ + if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + return -EIO; + + /* must be clean, since sync_filesystem() was already called */ + if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + return -EINVAL; + return 0; } static int f2fs_unfreeze(struct super_block *sb) @@ -2153,3 +2157,4 @@ module_exit(exit_f2fs_fs) MODULE_AUTHOR("Samsung Electronics's Praesto Team"); MODULE_DESCRIPTION("Flash Friendly File System"); MODULE_LICENSE("GPL"); + From f762da7ab57dc5214ed08fffd574dc3a849bbb15 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 5 Nov 2016 11:12:40 +0800 Subject: [PATCH 446/546] Revert "f2fs: do not recover from previous remained wrong dnodes" commit d47b8715953ad0cf82bb0a9d30d7b11d83bc9c11 upstream. i_times of inode will be set with current system time which can be configured through 'date', so it's not safe to judge dnode block as garbage data or unchanged inode depend on i_times. Now, we have used enhanced 'cp_ver + cp' crc method to verify valid dnode block, so I expect recoverying invalid dnode is almost not possible. This reverts commit 807b1e1c8e08452948495b1a9985ab46d329e5c2. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 31 +------------------------------ 1 file changed, 1 insertion(+), 30 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 612a1e98625d..1d25768dca33 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -196,32 +196,6 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static bool is_same_inode(struct inode *inode, struct page *ipage) -{ - struct f2fs_inode *ri = F2FS_INODE(ipage); - struct timespec disk; - - if (!IS_INODE(ipage)) - return true; - - disk.tv_sec = le64_to_cpu(ri->i_ctime); - disk.tv_nsec = le32_to_cpu(ri->i_ctime_nsec); - if (timespec_compare(&inode->i_ctime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_atime); - disk.tv_nsec = le32_to_cpu(ri->i_atime_nsec); - if (timespec_compare(&inode->i_atime, &disk) > 0) - return false; - - disk.tv_sec = le64_to_cpu(ri->i_mtime); - disk.tv_nsec = le32_to_cpu(ri->i_mtime_nsec); - if (timespec_compare(&inode->i_mtime, &disk) > 0) - return false; - - return true; -} - static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) { struct curseg_info *curseg; @@ -248,10 +222,7 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) goto next; entry = get_fsync_inode(head, ino_of_node(page)); - if (entry) { - if (!is_same_inode(entry->inode, page)) - goto next; - } else { + if (!entry) { if (IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) From c36393b43a8ad7a8b8c9aa883d4d85fe2a5269f5 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 7 Nov 2016 21:22:31 +0800 Subject: [PATCH 447/546] f2fs: return directly if block has been removed from the victim commit 206147112860418fa9363cf94ec2c172bdf4313a upstream. If one block has been to written to a new place, just return in move data process. This patch check it again with holding page lock. Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/gc.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 72a0ca08f901..744031194934 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -544,7 +544,8 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx) +static void move_encrypted_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -564,6 +565,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) if (!page) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -643,7 +647,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx) f2fs_put_page(page, 1); } -static void move_data_page(struct inode *inode, block_t bidx, int gc_type) +static void move_data_page(struct inode *inode, block_t bidx, int gc_type, + unsigned int segno, int off) { struct page *page; @@ -651,6 +656,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type) if (IS_ERR(page)) return; + if (!check_valid_map(F2FS_I_SB(inode), segno, off)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; @@ -792,9 +800,9 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx); + move_encrypted_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type); + move_data_page(inode, start_bidx, gc_type, segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); From b55145a8ee43e63d0f32bb74598755323a63d357 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:31:40 -0800 Subject: [PATCH 448/546] f2fs: revert segment allocation for direct IO commit 6ae1be13e85f4c42c8ca371fda50ae39eebbfd96 upstream. Now we don't need to be too much careful about storage alignment for dio, since its speed becomes quite fast and we'd better avoid any misalignment first. Revert: 38aa0889b250 (f2fs: align direct_io'ed data to section) Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 6 +----- fs/f2fs/f2fs.h | 1 - fs/f2fs/segment.c | 36 +++++++++--------------------------- 3 files changed, 10 insertions(+), 33 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index a60efb8fdd3a..beb747132858 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -583,7 +583,6 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct f2fs_summary sum; struct node_info ni; - int seg = CURSEG_WARM_DATA; pgoff_t fofs; blkcnt_t count = 1; @@ -601,11 +600,8 @@ static int __allocate_data_block(struct dnode_of_data *dn) get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); - if (dn->ofs_in_node == 0 && dn->inode_page == dn->node_page) - seg = CURSEG_DIRECT_IO; - allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, seg); + &sum, CURSEG_WARM_DATA); set_data_blkaddr(dn); /* update i_size */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c6dba704b0fe..4d4bfbaeb788 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -639,7 +639,6 @@ enum { CURSEG_WARM_NODE, /* direct node blocks of normal files */ CURSEG_COLD_NODE, /* indirect node blocks */ NO_CHECK_TYPE, - CURSEG_DIRECT_IO, /* to use for the direct IO path */ }; struct flush_cmd { diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4bdf1191a36f..19ab2e63d8d7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1267,25 +1267,21 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, stat_inc_seg_type(sbi, curseg); } -static void __allocate_new_segments(struct f2fs_sb_info *sbi, int type) -{ - struct curseg_info *curseg = CURSEG_I(sbi, type); - unsigned int old_segno; - - old_segno = curseg->segno; - SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); - locate_dirty_segment(sbi, old_segno); -} - void allocate_new_segments(struct f2fs_sb_info *sbi) { + struct curseg_info *curseg; + unsigned int old_segno; int i; if (test_opt(sbi, LFS)) return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segments(sbi, i); + for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { + curseg = CURSEG_I(sbi, i); + old_segno = curseg->segno; + SIT_I(sbi)->s_ops->allocate_segment(sbi, i, true); + locate_dirty_segment(sbi, old_segno); + } } static const struct segment_allocation default_salloc_ops = { @@ -1419,25 +1415,11 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_summary *sum, int type) { struct sit_info *sit_i = SIT_I(sbi); - struct curseg_info *curseg; - bool direct_io = (type == CURSEG_DIRECT_IO); - - if (direct_io) { - if (sbi->active_logs <= 4) - type = CURSEG_HOT_DATA; - else - type = CURSEG_WARM_DATA; - } - curseg = CURSEG_I(sbi, type); + struct curseg_info *curseg = CURSEG_I(sbi, type); mutex_lock(&curseg->curseg_mutex); mutex_lock(&sit_i->sentry_lock); - /* direct_io'ed data is aligned to the segment for better performance */ - if (direct_io && curseg->next_blkoff && - !has_not_enough_free_secs(sbi, 0, 0)) - __allocate_new_segments(sbi, type); - *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); /* From 983624348fccbbeebac8c87e9b07bd6f89278f04 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 12:08:22 -0800 Subject: [PATCH 449/546] f2fs: allow dio read for LFS mode commit e57e9ae5b179a6b243c42bf6d9549d1595c27089 upstream. We can allow dio reads for LFS mode, while doing buffered writes for dio writes. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index beb747132858..f59c91765b1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1735,7 +1735,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) return 0; - if (test_opt(F2FS_I_SB(inode), LFS)) + if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; if (trace_android_fs_dataread_start_enabled() && From 88d97b16ec19356a0c41b290fdd398c878ff03ba Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 6 Oct 2016 19:02:05 -0700 Subject: [PATCH 450/546] f2fs: support multiple devices commit 3c62be17d4f562f43fe1d03b48194399caa35aa5 upstream. This patch implements multiple devices support for f2fs. Given multiple devices by mkfs.f2fs, f2fs shows them entirely as one big volume under one f2fs instance. Internal block management is very simple, but we will modify block allocation and background GC policy to boost IO speed by exploiting them accoording to each device speed. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 ++++++++++++++-- fs/f2fs/f2fs.h | 29 +++++++-- fs/f2fs/segment.c | 112 +++++++++++++++++++++++--------- fs/f2fs/super.c | 138 +++++++++++++++++++++++++++++++--------- include/linux/f2fs_fs.h | 10 ++- 5 files changed, 274 insertions(+), 70 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f59c91765b1d..31e9c64e0f1d 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -88,6 +88,46 @@ static void f2fs_write_end_io(struct bio *bio) bio_put(bio); } +/* + * Return true, if pre_bio's bdev is same as its target device. + */ +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + struct block_device *bdev = sbi->sb->s_bdev; + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + if (FDEV(i).start_blk <= blk_addr && + FDEV(i).end_blk >= blk_addr) { + blk_addr -= FDEV(i).start_blk; + bdev = FDEV(i).bdev; + break; + } + } + if (bio) { + bio->bi_bdev = bdev; + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + } + return bdev; +} + +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr) + return i; + return 0; +} + +static bool __same_bdev(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio) +{ + return f2fs_target_device(sbi, blk_addr, NULL) == bio->bi_bdev; +} + /* * Low-level block read/write IO operations. */ @@ -98,8 +138,7 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, bio = f2fs_bio_alloc(npages); - bio->bi_bdev = sbi->sb->s_bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr); + f2fs_target_device(sbi, blk_addr, bio); bio->bi_end_io = is_read ? f2fs_read_end_io : f2fs_write_end_io; bio->bi_private = is_read ? NULL : sbi; @@ -269,7 +308,8 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - io->fio.rw != fio->rw)) + (io->fio.rw != fio->rw) || + !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { @@ -956,7 +996,6 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct fscrypt_ctx *ctx = NULL; - struct block_device *bdev = sbi->sb->s_bdev; struct bio *bio; if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { @@ -974,8 +1013,7 @@ static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, fscrypt_release_ctx(ctx); return ERR_PTR(-ENOMEM); } - bio->bi_bdev = bdev; - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blkaddr); + f2fs_target_device(sbi, blkaddr, bio); bio->bi_end_io = f2fs_read_end_io; bio->bi_private = ctx; @@ -1069,7 +1107,8 @@ static int f2fs_mpage_readpages(struct address_space *mapping, * This page will go to BIO. Do we need to send this * BIO off first? */ - if (bio && (last_block_in_bio != block_nr - 1)) { + if (bio && (last_block_in_bio != block_nr - 1 || + !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); bio = NULL; @@ -1737,6 +1776,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, return 0; if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) return 0; + if (F2FS_I_SB(inode)->s_ndevs) + return 0; if (trace_android_fs_dataread_start_enabled() && (iov_iter_rw(iter) == READ)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4d4bfbaeb788..04f6dddc6d91 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -756,6 +756,20 @@ struct f2fs_bio_info { struct rw_semaphore io_rwsem; /* blocking op for bio */ }; +#define FDEV(i) (sbi->devs[i]) +#define RDEV(i) (raw_super->devs[i]) +struct f2fs_dev_info { + struct block_device *bdev; + char path[MAX_PATH_LEN]; + unsigned int total_segments; + block_t start_blk; + block_t end_blk; +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int nr_blkz; /* Total number of zones */ + u8 *blkz_type; /* Array of zones type */ +#endif +}; + enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ @@ -804,10 +818,8 @@ struct f2fs_sb_info { #endif #ifdef CONFIG_BLK_DEV_ZONED - unsigned int nr_blkz; /* Total number of zones */ unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ - u8 *blkz_type; /* Array of zones type */ #endif /* for node-related operations */ @@ -924,6 +936,8 @@ struct f2fs_sb_info { /* For shrinker support */ struct list_head s_list; + int s_ndevs; /* number of devices */ + struct f2fs_dev_info *devs; /* for device list */ struct mutex umount_mutex; unsigned int shrinker_run_no; @@ -2190,6 +2204,9 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); void f2fs_submit_page_mbio(struct f2fs_io_info *); +struct block_device *f2fs_target_device(struct f2fs_sb_info *, + block_t, struct bio *); +int f2fs_target_device_index(struct f2fs_sb_info *, block_t); void set_data_blkaddr(struct dnode_of_data *); void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); @@ -2477,11 +2494,15 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, - block_t blkaddr) + struct block_device *bdev, block_t blkaddr) { unsigned int zno = blkaddr >> sbi->log_blocks_per_blkz; + int i; - return sbi->blkz_type[zno]; + for (i = 0; i < sbi->s_ndevs; i++) + if (FDEV(i).bdev == bdev) + return FDEV(i).blkz_type[zno]; + return -EINVAL; } #endif diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 19ab2e63d8d7..30d0e9a76c62 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -401,6 +401,32 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } +static int __submit_flush_wait(struct block_device *bdev) +{ + struct bio *bio = f2fs_bio_alloc(0); + int ret; + + bio->bi_bdev = bdev; + ret = submit_bio_wait(WRITE_FLUSH, bio); + bio_put(bio); + return ret; +} + +static int submit_flush_wait(struct f2fs_sb_info *sbi) +{ + int ret = __submit_flush_wait(sbi->sb->s_bdev); + int i; + + if (sbi->s_ndevs && !ret) { + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(FDEV(i).bdev); + if (ret) + break; + } + } + return ret; +} + static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; @@ -411,24 +437,18 @@ static int issue_flush_thread(void *data) return 0; if (!llist_empty(&fcc->issue_list)) { - struct bio *bio; struct flush_cmd *cmd, *next; int ret; - bio = f2fs_bio_alloc(0); - fcc->dispatch_list = llist_del_all(&fcc->issue_list); fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); - + ret = submit_flush_wait(sbi); llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; complete(&cmd->wait); } - bio_put(bio); fcc->dispatch_list = NULL; } @@ -449,14 +469,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) return 0; if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - struct bio *bio = f2fs_bio_alloc(0); int ret; atomic_inc(&fcc->submit_flush); - bio->bi_bdev = sbi->sb->s_bdev; - ret = submit_bio_wait(WRITE_FLUSH, bio); + ret = submit_flush_wait(sbi); atomic_dec(&fcc->submit_flush); - bio_put(bio); return ret; } @@ -586,18 +603,24 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } #ifdef CONFIG_BLK_DEV_ZONED -static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t sector = SECTOR_FROM_BLOCK(blkstart); sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - struct block_device *bdev = sbi->sb->s_bdev; + sector_t sector; + int devi = 0; - if (nr_sects != bdev_zone_size(bdev)) { + if (sbi->s_ndevs) { + devi = f2fs_target_device_index(sbi, blkstart); + blkstart -= FDEV(devi).start_blk; + } + sector = SECTOR_FROM_BLOCK(blkstart); + + if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, - "Unaligned discard attempted (sector %llu + %llu)", - (unsigned long long)sector, - (unsigned long long)nr_sects); + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); return -EIO; } @@ -606,7 +629,7 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, * use regular discard if the drive supports it. For sequential * zones, reset the zone write pointer. */ - switch (get_blkz_type(sbi, blkstart)) { + switch (get_blkz_type(sbi, bdev, blkstart)) { case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) @@ -625,29 +648,60 @@ static int f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } #endif -static int f2fs_issue_discard(struct f2fs_sb_info *sbi, - block_t blkstart, block_t blklen) +static int __issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) { sector_t start = SECTOR_FROM_BLOCK(blkstart); sector_t len = SECTOR_FROM_BLOCK(blklen); + +#ifdef CONFIG_BLK_DEV_ZONED + if (f2fs_sb_mounted_blkzoned(sbi->sb) && + bdev_zoned_model(bdev) != BLK_ZONED_NONE) + return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); +#endif + return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); +} + +static int f2fs_issue_discard(struct f2fs_sb_info *sbi, + block_t blkstart, block_t blklen) +{ + sector_t start = blkstart, len = 0; + struct block_device *bdev; struct seg_entry *se; unsigned int offset; block_t i; + int err = 0; + + bdev = f2fs_target_device(sbi, blkstart, NULL); + + for (i = blkstart; i < blkstart + blklen; i++, len++) { + if (i != start) { + struct block_device *bdev2 = + f2fs_target_device(sbi, i, NULL); + + if (bdev2 != bdev) { + err = __issue_discard_async(sbi, bdev, + start, len); + if (err) + return err; + bdev = bdev2; + start = i; + len = 0; + } + } - for (i = blkstart; i < blkstart + blklen; i++) { se = get_seg_entry(sbi, GET_SEGNO(sbi, i)); offset = GET_BLKOFF_FROM_SEG0(sbi, i); if (!f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); -#ifdef CONFIG_BLK_DEV_ZONED - if (f2fs_sb_mounted_blkzoned(sbi->sb)) - return f2fs_issue_discard_zone(sbi, blkstart, blklen); -#endif - return blkdev_issue_discard(sbi->sb->s_bdev, start, len, GFP_NOFS, 0); + if (len) + err = __issue_discard_async(sbi, bdev, start, len); + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + return err; } static void __add_discard_entry(struct f2fs_sb_info *sbi, diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 7a0634b0bee8..2d332a16de71 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -713,6 +713,19 @@ static void destroy_percpu_info(struct f2fs_sb_info *sbi) percpu_counter_destroy(&sbi->total_valid_inode_count); } +static void destroy_device_list(struct f2fs_sb_info *sbi) +{ + int i; + + for (i = 0; i < sbi->s_ndevs; i++) { + blkdev_put(FDEV(i).bdev, FMODE_EXCL); +#ifdef CONFIG_BLK_DEV_ZONED + kfree(FDEV(i).blkz_type); +#endif + } + kfree(sbi->devs); +} + static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -773,6 +786,8 @@ static void f2fs_put_super(struct super_block *sb) crypto_free_shash(sbi->s_chksum_driver); kfree(sbi->raw_super); + destroy_device_list(sbi); + destroy_percpu_info(sbi); kfree(sbi); } @@ -1517,9 +1532,9 @@ static int init_percpu_info(struct f2fs_sb_info *sbi) } #ifdef CONFIG_BLK_DEV_ZONED -static int init_blkz_info(struct f2fs_sb_info *sbi) +static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) { - struct block_device *bdev = sbi->sb->s_bdev; + struct block_device *bdev = FDEV(devi).bdev; sector_t nr_sectors = bdev->bd_part->nr_sects; sector_t sector = 0; struct blk_zone *zones; @@ -1530,15 +1545,21 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) if (!f2fs_sb_mounted_blkzoned(sbi->sb)) return 0; + if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != + SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + return -EINVAL; sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != + __ilog2_u32(sbi->blocks_per_blkz)) + return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); - sbi->nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> - sbi->log_blocks_per_blkz; + FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> + sbi->log_blocks_per_blkz; if (nr_sectors & (bdev_zone_size(bdev) - 1)) - sbi->nr_blkz++; + FDEV(devi).nr_blkz++; - sbi->blkz_type = kmalloc(sbi->nr_blkz, GFP_KERNEL); - if (!sbi->blkz_type) + FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); + if (!FDEV(devi).blkz_type) return -ENOMEM; #define F2FS_REPORT_NR_ZONES 4096 @@ -1563,7 +1584,7 @@ static int init_blkz_info(struct f2fs_sb_info *sbi) } for (i = 0; i < nr_zones; i++) { - sbi->blkz_type[n] = zones[i].type; + FDEV(devi).blkz_type[n] = zones[i].type; sector += zones[i].len; n++; } @@ -1667,6 +1688,77 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) return err; } +static int f2fs_scan_devices(struct f2fs_sb_info *sbi) +{ + struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + int i; + + for (i = 0; i < MAX_DEVICES; i++) { + if (!RDEV(i).path[0]) + return 0; + + if (i == 0) { + sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * + MAX_DEVICES, GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; + } + + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + if (IS_ERR(FDEV(i).bdev)) + return PTR_ERR(FDEV(i).bdev); + + /* to release errored devices */ + sbi->s_ndevs = i + 1; + +#ifdef CONFIG_BLK_DEV_ZONED + if (bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HM && + !f2fs_sb_mounted_blkzoned(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Zoned block device feature not enabled\n"); + return -EINVAL; + } + if (bdev_zoned_model(FDEV(i).bdev) != BLK_ZONED_NONE) { + if (init_blkz_info(sbi, i)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Failed to initialize F2FS blkzone information"); + return -EINVAL; + } + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk, + bdev_zoned_model(FDEV(i).bdev) == BLK_ZONED_HA ? + "Host-aware" : "Host-managed"); + continue; + } +#endif + f2fs_msg(sbi->sb, KERN_INFO, + "Mount Device [%2d]: %20s, %8u, %8x - %8x", + i, FDEV(i).path, + FDEV(i).total_segments, + FDEV(i).start_blk, FDEV(i).end_blk); + } + return 0; +} + static int f2fs_fill_super(struct super_block *sb, void *data, int silent) { struct f2fs_sb_info *sbi; @@ -1725,15 +1817,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) "Zoned block device support is not enabled\n"); goto free_sb_buf; } -#else - if (bdev_zoned_model(sb->s_bdev) == BLK_ZONED_HM && - !f2fs_sb_mounted_blkzoned(sb)) { - f2fs_msg(sb, KERN_ERR, - "Zoned block device feature not enabled\n"); - goto free_sb_buf; - } #endif - default_options(sbi); /* parse mount options */ options = kstrdup((const char *)data, GFP_KERNEL); @@ -1803,6 +1887,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_meta_inode; } + /* Initialize device list */ + err = f2fs_scan_devices(sbi); + if (err) { + f2fs_msg(sb, KERN_ERR, "Failed to find devices"); + goto free_devices; + } + sbi->total_valid_node_count = le32_to_cpu(sbi->ckpt->valid_node_count); percpu_counter_set(&sbi->total_valid_inode_count, @@ -1821,15 +1912,6 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) init_ino_entry_info(sbi); -#ifdef CONFIG_BLK_DEV_ZONED - err = init_blkz_info(sbi); - if (err) { - f2fs_msg(sb, KERN_ERR, - "Failed to initialize F2FS blkzone information"); - goto free_blkz; - } -#endif - /* setup f2fs internal modules */ err = build_segment_manager(sbi); if (err) { @@ -2008,10 +2090,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) destroy_node_manager(sbi); free_sm: destroy_segment_manager(sbi); -#ifdef CONFIG_BLK_DEV_ZONED -free_blkz: - kfree(sbi->blkz_type); -#endif +free_devices: + destroy_device_list(sbi); kfree(sbi->ckpt); free_meta_inode: make_bad_inode(sbi->meta_inode); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index e46e7d10312b..3e5972ef5019 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -52,10 +52,17 @@ #define VERSION_LEN 256 #define MAX_VOLUME_NAME 512 +#define MAX_PATH_LEN 64 +#define MAX_DEVICES 8 /* * For superblock */ +struct f2fs_device { + __u8 path[MAX_PATH_LEN]; + __le32 total_segments; +} __packed; + struct f2fs_super_block { __le32 magic; /* Magic Number */ __le16 major_ver; /* Major Version */ @@ -94,7 +101,8 @@ struct f2fs_super_block { __le32 feature; /* defined features */ __u8 encryption_level; /* versioning level for encryption */ __u8 encrypt_pw_salt[16]; /* Salt used for string2key algorithm */ - __u8 reserved[871]; /* valid reserved region */ + struct f2fs_device devs[MAX_DEVICES]; /* device list */ + __u8 reserved[327]; /* valid reserved region */ } __packed; /* From 9f13883d394ab76e0e768e8c0a30f0f9d5c22e96 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:31:56 -0800 Subject: [PATCH 451/546] f2fs: use err for f2fs_preallocate_blocks commit a7de608691f766cd148971a71d4f13aa1692d4c8 upstream. This patch has no functional change. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 26 +++++++++++++------------- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 35 +++++++++++++++++++---------------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 31e9c64e0f1d..87e07665832e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -653,11 +653,11 @@ static int __allocate_data_block(struct dnode_of_data *dn) return 0; } -ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); struct f2fs_map_blocks map; - ssize_t ret = 0; + int err = 0; map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); @@ -669,19 +669,19 @@ ssize_t f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) map.m_next_pgofs = NULL; if (iocb->ki_flags & IOCB_DIRECT) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; } if (!f2fs_has_inline_data(inode)) return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - return ret; + return err; } /* @@ -858,19 +858,19 @@ static int __get_data_block(struct inode *inode, sector_t iblock, pgoff_t *next_pgofs) { struct f2fs_map_blocks map; - int ret; + int err; map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; map.m_next_pgofs = next_pgofs; - ret = f2fs_map_blocks(inode, &map, create, flag); - if (!ret) { + err = f2fs_map_blocks(inode, &map, create, flag); + if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; bh->b_size = map.m_len << inode->i_blkbits; } - return ret; + return err; } static int get_data_block(struct inode *inode, sector_t iblock, diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 04f6dddc6d91..8dc378d82f67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2212,7 +2212,7 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); int reserve_new_block(struct dnode_of_data *); int f2fs_get_block(struct dnode_of_data *, pgoff_t); -ssize_t f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); +int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); struct page *find_data_page(struct inode *, pgoff_t); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index ce38a350fb38..fbfcd809baec 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1324,15 +1324,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, pgoff_t pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; - int ret; + int err; - ret = inode_newsize_ok(inode, (len + offset)); - if (ret) - return ret; + err = inode_newsize_ok(inode, (len + offset)); + if (err) + return err; - ret = f2fs_convert_inline_inode(inode); - if (ret) - return ret; + err = f2fs_convert_inline_inode(inode); + if (err) + return err; f2fs_balance_fs(sbi, true); @@ -1344,12 +1344,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (off_end) map.m_len++; - ret = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); - if (ret) { + err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + if (err) { pgoff_t last_off; if (!map.m_len) - return ret; + return err; last_off = map.m_lblk + map.m_len - 1; @@ -1363,7 +1363,7 @@ static int expand_inode_data(struct inode *inode, loff_t offset, if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); - return ret; + return err; } static long f2fs_fallocate(struct file *file, int mode, @@ -2267,12 +2267,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - ret = f2fs_preallocate_blocks(iocb, from); - if (!ret) { - blk_start_plug(&plug); - ret = __generic_file_write_iter(iocb, from); - blk_finish_plug(&plug); + int err = f2fs_preallocate_blocks(iocb, from); + + if (err) { + inode_unlock(inode); + return err; } + blk_start_plug(&plug); + ret = __generic_file_write_iter(iocb, from); + blk_finish_plug(&plug); } inode_unlock(inode); From c678a1a897f001e49085cad430816d830e9bf865 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 11 Nov 2016 16:46:40 -0800 Subject: [PATCH 452/546] f2fs: fix redundant block allocation commit c040ff9d69fd1d782fe577ba9e35c1f5798158ae upstream. In direct_IO path of f2fs_file_write_iter(), 1. f2fs_preallocate_blocks(F2FS_GET_BLOCK_PRE_DIO) -> allocate LBA X 2. f2fs_direct_IO() -> return 0; Then, f2fs_write_data_page() will allocate another LBA X+1. This makes EIO triggered by HM-SMR. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 87e07665832e..ed1ac24ba7c4 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -653,6 +653,13 @@ static int __allocate_data_block(struct dnode_of_data *dn) return 0; } +static inline bool __force_buffered_io(struct inode *inode, int rw) +{ + return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || + F2FS_I_SB(inode)->s_ndevs); +} + int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) { struct inode *inode = file_inode(iocb->ki_filp); @@ -672,7 +679,10 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) err = f2fs_convert_inline_inode(inode); if (err) return err; - return f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_DIO); + return f2fs_map_blocks(inode, &map, 1, + __force_buffered_io(inode, WRITE) ? + F2FS_GET_BLOCK_PRE_AIO : + F2FS_GET_BLOCK_PRE_DIO); } if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { err = f2fs_convert_inline_inode(inode); @@ -1772,11 +1782,7 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, if (err) return err; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return 0; - if (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) - return 0; - if (F2FS_I_SB(inode)->s_ndevs) + if (__force_buffered_io(inode, rw)) return 0; if (trace_android_fs_dataread_start_enabled() && From 3857ba036b8e77d20d8e8e6197771373b3513e21 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 17:38:35 -0800 Subject: [PATCH 453/546] f2fs: avoid BG_GC in f2fs_balance_fs commit 7702bdbe505a22380dd958e2ee35124c7c414806 upstream. If many threads hit has_not_enough_free_secs() in f2fs_balance_fs() at the same time, all the threads would do FG_GC or BG_GC. In this critical path, we totally don't need to do BG_GC at all. Let's avoid that. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/file.c | 2 +- fs/f2fs/gc.c | 7 +++++-- fs/f2fs/segment.c | 2 +- 4 files changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 8dc378d82f67..687ab43a6cd8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2235,7 +2235,7 @@ int f2fs_migrate_page(struct address_space *, struct page *, struct page *, int start_gc_thread(struct f2fs_sb_info *); void stop_gc_thread(struct f2fs_sb_info *); block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool); +int f2fs_gc(struct f2fs_sb_info *, bool, bool); void build_gc_manager(struct f2fs_sb_info *); /* diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index fbfcd809baec..84f4572ae959 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1853,7 +1853,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync); + ret = f2fs_gc(sbi, sync, true); out: mnt_drop_write_file(filp); return ret; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 744031194934..54d06c21af07 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -82,7 +82,7 @@ static int gc_thread_func(void *data) stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC))) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -905,7 +905,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, return sec_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) { unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; @@ -946,6 +946,9 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync) if (ret) goto stop; } + } else if (gc_type == BG_GC && !background) { + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + goto stop; } if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 30d0e9a76c62..27e1b7c56e4c 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -364,7 +364,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false); + f2fs_gc(sbi, false, false); } } From 028f924c99b827d10f52f7b1770f92062b085a78 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 14 Nov 2016 18:20:10 -0800 Subject: [PATCH 454/546] f2fs: fix wrong written_valid_blocks counting commit c79b7ff1d3c7710c23d8828a69d8cbc5597ad19f upstream. Previously, written_valid_blocks was got by ckpt->valid_block_count. But if the last checkpoint has some NEW_ADDR due to power-cut, we can get wrong value. Fix it to get the number from actual written block count from sit entries. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 27e1b7c56e4c..58cae4a541a7 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2176,7 +2176,6 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) static int build_sit_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); - struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; char *src_bitmap, *dst_bitmap; @@ -2243,7 +2242,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; - sit_i->written_valid_blocks = le64_to_cpu(ckpt->valid_block_count); + sit_i->written_valid_blocks = 0; sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; @@ -2397,6 +2396,9 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) struct seg_entry *sentry = get_seg_entry(sbi, start); if (!sentry->valid_blocks) __set_free(sbi, start); + else + SIT_I(sbi)->written_valid_blocks += + sentry->valid_blocks; } /* set use the current segments */ From c6e489e887e36830643f231a17029296b2a707d3 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Wed, 16 Nov 2016 10:41:20 +0800 Subject: [PATCH 455/546] f2fs: don't wait writeback for datas during checkpoint commit 36951b38d13ac7cce9fcf89e0e01c22ed0d05688 upstream. Normally, while committing checkpoint, we will wait on all pages to be writebacked no matter the page is data or metadata, so in scenario where there are lots of data IO being submitted with metadata, we may suffer long latency for waiting writeback during checkpoint. Indeed, we only care about persistence for pages with metadata, but not pages with data, as file system consistent are only related to metadate, so in order to avoid encountering long latency in above scenario, let's recognize and reference metadata in submitted IOs, wait writeback only for metadatas. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 2 +- fs/f2fs/data.c | 35 +++++++++++++++++++++++++++++------ fs/f2fs/debug.c | 7 ++++--- fs/f2fs/f2fs.h | 9 ++++++--- fs/f2fs/file.c | 2 -- fs/f2fs/gc.c | 2 -- fs/f2fs/segment.c | 1 - 7 files changed, 40 insertions(+), 18 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index ed79757c36e0..889317e07122 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1005,7 +1005,7 @@ static void wait_on_all_pages_writeback(struct f2fs_sb_info *sbi) for (;;) { prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE); - if (!atomic_read(&sbi->nr_wb_bios)) + if (!get_pages(sbi, F2FS_WB_CP_DATA)) break; io_schedule_timeout(5*HZ); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ed1ac24ba7c4..fa254daac970 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -30,6 +30,26 @@ #include #include +static bool __is_cp_guaranteed(struct page *page) +{ + struct address_space *mapping = page->mapping; + struct inode *inode; + struct f2fs_sb_info *sbi; + + if (!mapping) + return false; + + inode = mapping->host; + sbi = F2FS_I_SB(inode); + + if (inode->i_ino == F2FS_META_INO(sbi) || + inode->i_ino == F2FS_NODE_INO(sbi) || + S_ISDIR(inode->i_mode) || + is_cold_data(page)) + return true; + return false; +} + static void f2fs_read_end_io(struct bio *bio) { struct bio_vec *bvec; @@ -72,6 +92,7 @@ static void f2fs_write_end_io(struct bio *bio) bio_for_each_segment_all(bvec, bio, i) { struct page *page = bvec->bv_page; + enum count_type type = WB_DATA_TYPE(page); fscrypt_pullback_bio_page(&page, true); @@ -79,9 +100,11 @@ static void f2fs_write_end_io(struct bio *bio) set_bit(AS_EIO, &page->mapping->flags); f2fs_stop_checkpoint(sbi, true); } + dec_page_count(sbi, type); + clear_cold_data(page); end_page_writeback(page); } - if (atomic_dec_and_test(&sbi->nr_wb_bios) && + if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) wake_up(&sbi->cp_wait); @@ -149,7 +172,6 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, struct bio *bio, enum page_type type) { if (!is_read_io(rw)) { - atomic_inc(&sbi->nr_wb_bios); if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); @@ -305,6 +327,11 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) verify_block_addr(sbi, fio->old_blkaddr); verify_block_addr(sbi, fio->new_blkaddr); + bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; + + if (!is_read) + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || @@ -318,8 +345,6 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io->fio = *fio; } - bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); @@ -1331,7 +1356,6 @@ static int f2fs_write_data_page(struct page *page, if (err && err != -ENOENT) goto redirty_out; - clear_cold_data(page); out: inode_dec_dirty_pages(inode); if (err) @@ -1745,7 +1769,6 @@ static int f2fs_write_end(struct file *file, goto unlock_out; set_page_dirty(page); - clear_cold_data(page); if (pos + copied > i_size_read(inode)) f2fs_i_size_write(inode, pos + copied); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 678733c78f55..fbd5184140d0 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,7 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); - si->wb_bios = atomic_read(&sbi->nr_wb_bios); + si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); + si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -313,8 +314,8 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_bios: %4d\n", - si->inmem_pages, si->wb_bios); + seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", + si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 687ab43a6cd8..d6119ea3b86d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -702,6 +702,7 @@ struct f2fs_sm_info { * f2fs monitors the number of several block types such as on-writeback, * dirty dentry blocks, dirty node blocks, and dirty meta blocks. */ +#define WB_DATA_TYPE(p) (__is_cp_guaranteed(p) ? F2FS_WB_CP_DATA : F2FS_WB_DATA) enum count_type { F2FS_DIRTY_DENTS, F2FS_DIRTY_DATA, @@ -709,6 +710,8 @@ enum count_type { F2FS_DIRTY_META, F2FS_INMEM_PAGES, F2FS_DIRTY_IMETA, + F2FS_WB_CP_DATA, + F2FS_WB_DATA, NR_COUNT_TYPE, }; @@ -888,7 +891,6 @@ struct f2fs_sb_info { block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ u32 s_next_generation; /* for NFS support */ - atomic_t nr_wb_bios; /* # of writeback bios */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1302,7 +1304,8 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) { atomic_inc(&sbi->nr_pages[count_type]); - if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES) + if (count_type == F2FS_DIRTY_DATA || count_type == F2FS_INMEM_PAGES || + count_type == F2FS_WB_CP_DATA || count_type == F2FS_WB_DATA) return; set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -2261,7 +2264,7 @@ struct f2fs_stat_info { unsigned int ndirty_dirs, ndirty_files, ndirty_all; int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; int total_count, utilization; - int bg_gc, wb_bios; + int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 84f4572ae959..bab65f0a5bb5 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -95,8 +95,6 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); - /* if gced page is attached, don't write to cold segment */ - clear_cold_data(page); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 54d06c21af07..6390d45c1b68 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -690,8 +690,6 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, congestion_wait(BLK_RW_ASYNC, HZ/50); goto retry; } - - clear_cold_data(page); } out: f2fs_put_page(page, 1); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 58cae4a541a7..23e8892c4e60 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -288,7 +288,6 @@ static int __commit_inmem_pages(struct inode *inode, /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - clear_cold_data(page); submit_bio = true; } unlock_page(page); From d197d6732e8cf14c4465efc7945c5caab8984637 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Wed, 16 Nov 2016 17:26:24 +0800 Subject: [PATCH 456/546] f2fs: fix an infinite loop when flush nodes in cp commit d40a43af0a57a017eba9ad2679183791587ceb6a upstream. Thread A Thread B - write_checkpoint - block_operations -blk_start_plug -sync_node_pages - f2fs_do_sync_file - fsync_node_pages - f2fs_wait_on_page_writeback Thread A wait for global F2FS_DIRTY_NODES decreased to zero, it start a plug list, some requests have been added to this list. Thread B lock one dirty node page, and wait this page write back. But this page has been in plug list of thread A with PG_writeback flag. Thread A keep on running and its plug list has no chance to finish, so it seems a deadlock between cp and fsync path. This patch add a wait on page write back before set node page dirty to avoid this problem. Signed-off-by: Yunlei He Signed-off-by: Pengyang Hou Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 389be7f6e07c..59cc29e6b73c 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1409,6 +1409,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_page->index); lock_page(last_page); + f2fs_wait_on_page_writeback(last_page, NODE, true); set_page_dirty(last_page); unlock_page(last_page); goto retry; From c2cec7e4ab32120613b7a462c8bb9b47ca0c726e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:11 +0800 Subject: [PATCH 457/546] f2fs: fix to account total free nid correctly commit 04d47e673863c637a2b44ad34a558aeb5d0a727e upstream. Thread A Thread B Thread C - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid alloc last nid - f2fs_unlock_op - f2fs_create - f2fs_new_inode - f2fs_lock_op - alloc_nid as node count still not be increased, we will loop in alloc_nid - f2fs_write_node_pages - f2fs_balance_fs_bg - f2fs_sync_fs - write_checkpoint - block_operations - f2fs_lock_all - f2fs_lock_op While creating new inode, we do not allocate and account nid atomically, so that when there is almost no free nids left, we may encounter deadloop like above stack. In order to avoid that, reuse nm_i::available_nids for accounting free nids and make nid allocation and counting being atomical during node creation. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/node.c | 34 +++++++++++++++++++++++++++++----- 2 files changed, 30 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d6119ea3b86d..973ca74404de 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -560,7 +560,7 @@ enum nid_list { struct f2fs_nm_info { block_t nat_blkaddr; /* base disk address of NAT */ nid_t max_nid; /* maximum possible node ids */ - nid_t available_nids; /* maximum available node ids */ + nid_t available_nids; /* # of available node ids */ nid_t next_scan_nid; /* the next nid to be scanned */ unsigned int ram_thresh; /* control the memory footprint */ unsigned int ra_nid_pages; /* # of nid pages to be readaheaded */ diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 59cc29e6b73c..edacbabb92cf 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1885,11 +1885,13 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) if (time_to_inject(sbi, FAULT_ALLOC_NID)) return false; #endif - if (unlikely(sbi->total_valid_node_count + 1 > nm_i->available_nids)) - return false; - spin_lock(&nm_i->nid_list_lock); + if (unlikely(nm_i->available_nids == 0)) { + spin_unlock(&nm_i->nid_list_lock); + return false; + } + /* We should not use stale free nids created by build_free_nids */ if (nm_i->nid_cnt[FREE_NID_LIST] && !on_build_free_nids(nm_i)) { f2fs_bug_on(sbi, list_empty(&nm_i->nid_list[FREE_NID_LIST])); @@ -1900,6 +1902,7 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) __remove_nid_from_list(sbi, i, FREE_NID_LIST, true); i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); + nm_i->available_nids--; spin_unlock(&nm_i->nid_list_lock); return true; } @@ -1951,6 +1954,9 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) i->state = NID_NEW; __insert_nid_to_list(sbi, i, FREE_NID_LIST, false); } + + nm_i->available_nids++; + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2150,6 +2156,19 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = grab_nat_entry(nm_i, nid); node_info_from_raw_nat(&ne->ni, &raw_ne); } + + /* + * if a free nat in journal has not been used after last + * checkpoint, we should remove it from available nids, + * since later we will add it again. + */ + if (!get_nat_flag(ne, IS_DIRTY) && + le32_to_cpu(raw_ne.block_addr) == NULL_ADDR) { + spin_lock(&nm_i->nid_list_lock); + nm_i->available_nids--; + spin_unlock(&nm_i->nid_list_lock); + } + __set_nat_cache_dirty(nm_i, ne); } update_nats_in_cursum(journal, -i); @@ -2222,8 +2241,12 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); __clear_nat_cache_dirty(NM_I(sbi), ne); - if (nat_get_blkaddr(ne) == NULL_ADDR) + if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); + spin_lock(&NM_I(sbi)->nid_list_lock); + NM_I(sbi)->available_nids++; + spin_unlock(&NM_I(sbi)->nid_list_lock); + } } if (to_journal) @@ -2298,7 +2321,8 @@ static int init_node_manager(struct f2fs_sb_info *sbi) nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ - nm_i->available_nids = nm_i->max_nid - F2FS_RESERVED_NODE_NUM; + nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - + F2FS_RESERVED_NODE_NUM; nm_i->nid_cnt[FREE_NID_LIST] = 0; nm_i->nid_cnt[ALLOC_NID_LIST] = 0; nm_i->nat_cnt = 0; From 8cc3e395c9227321290abb31754feb9ff174c32e Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Nov 2016 20:53:31 +0800 Subject: [PATCH 458/546] f2fs: fix fdatasync commit 281518c694a5228d6c46fac83529fb3e2c331281 upstream. For below two cases, we can't guarantee data consistence: a) 1. xfs_io "pwrite 0 4195328" "fsync" 2. xfs_io "pwrite 4195328 1024" "fdatasync" 3. godown 4. umount & mount --> isize we updated before fdatasync won't be recovered b) 1. xfs_io "pwrite -S 0xcc 0 4202496" "fsync" 2. xfs_io "fpunch 4194304 4096" "fdatasync" 3. godown 4. umount & mount --> dnode we punched before fdatasync won't be recovered The reason is that normally fdatasync won't be aware of modification of metadata in file, e.g. isize changing, dnode updating, so in ->fsync we will skip flushing node pages for above cases, result in making fdatasynced file being lost during recovery. Currently we have introduced DIRTY_META global list in sbi for tracking dirty inode selectively, so in fdatasync we can choose to flush nodes depend on dirty state of current inode in the list. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 11 ++++++++++- fs/f2fs/file.c | 2 +- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 973ca74404de..16bedd87022d 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1763,8 +1763,17 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode) +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) { + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index bab65f0a5bb5..7fd8e7cffe9b 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -209,7 +209,7 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, } /* if the inode is dirty, let's recover all the time */ - if (!datasync && !f2fs_skip_inode_update(inode)) { + if (!f2fs_skip_inode_update(inode, datasync)) { f2fs_write_inode(inode, NULL); goto go_write; } From 65bdf690a315806986f2c841235dd372dbcd5c48 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 15:09:48 -0800 Subject: [PATCH 459/546] f2fs: do not recover i_size if it's valid commit 3a3a5ead7b6d2c9a29f493791ba23f264052db34 upstream. If i_size is already valid during roll_forward recovery, we should not update it according to the block alignment. Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 1d25768dca33..173212455db8 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -425,7 +425,7 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } - if ((start + 1) << PAGE_SHIFT > i_size_read(inode)) + if (i_size_read(inode) <= (start << PAGE_SHIFT)) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* From 4935b166e9609fd8911f700e268fbd1c1c7203a1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 16 Nov 2016 18:53:16 -0800 Subject: [PATCH 460/546] f2fs: fix wrong AUTO_RECOVER condition commit 97dd26ad834739d4e4ea35fd7ab5f92824de4cbb upstream. If i_size is not aligned to the f2fs's block size, we should not skip inode update during fsync. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 16bedd87022d..fa66e5baa58a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1774,7 +1774,8 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) spin_unlock(&sbi->inode_lock[DIRTY_META]); return ret; } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER)) + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + i_size_read(inode) & PAGE_MASK) return false; return F2FS_I(inode)->last_disk_size == i_size_read(inode); } From 4816405177cd809dd782af0e1cefbb8b9cfe95bb Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Fri, 18 Nov 2016 22:21:13 +0800 Subject: [PATCH 461/546] f2fs: drop duplicate header timer.h commit b4ceec29219e340178baa9c5f17bf97a42951cc8 upstream. Drop duplicate header timer.h from segment.c. Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 23e8892c4e60..ba715d60c738 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,7 +16,6 @@ #include #include #include -#include #include "f2fs.h" #include "segment.h" From 8d0c7ea77590e8c7aed54a68cf178522449d8848 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Fri, 18 Nov 2016 22:27:41 +0800 Subject: [PATCH 462/546] f2fs: fix incorrect free inode count in ->statfs commit b08b12d2ddc85b977a0531470cf6a7158289aaaf upstream. While calculating inode count that we can create at most in the left space, we should consider space which data/node blocks occupied, since we create data/node mixly in main area. So fix the wrong calculation in ->statfs. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/super.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 2d332a16de71..a288456c17c0 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -852,7 +852,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_bavail = user_block_count - valid_user_blocks(sbi); buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = buf->f_files - valid_inode_count(sbi); + buf->f_ffree = min(buf->f_files - valid_node_count(sbi), + buf->f_bavail); buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; From 248e942e558596e24967ef9642d7351b33aa3ccf Mon Sep 17 00:00:00 2001 From: Arnd Bergmann Date: Tue, 22 Nov 2016 15:20:16 +0100 Subject: [PATCH 463/546] f2fs: fix 32-bit build commit 19c526515f6b998039d5d71fea879d255f173746 upstream. The addition of multiple-device support broke CONFIG_BLK_DEV_ZONED on 32-bit machines because of a 64-bit division: fs/f2fs/f2fs.o: In function `__issue_discard_async': extent_cache.c:(.text.__issue_discard_async+0xd4): undefined reference to `__aeabi_uldivmod' Fortunately, bdev_zone_size() is guaranteed to return a power-of-two number, so we can replace the % operator with a cheaper bit mask. Fixes: 792b84b74b54 ("f2fs: support multiple devices") Signed-off-by: Arnd Bergmann Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index ba715d60c738..4f557e5e789d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -614,7 +614,8 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, } sector = SECTOR_FROM_BLOCK(blkstart); - if (sector % bdev_zone_size(bdev) || nr_sects != bdev_zone_size(bdev)) { + if (sector & (bdev_zone_size(bdev) - 1) || + nr_sects != bdev_zone_size(bdev)) { f2fs_msg(sbi->sb, KERN_INFO, "(%d) %s: Unaligned discard attempted (block %x + %x)", devi, sbi->s_ndevs ? FDEV(devi).path: "", From f5cd931dc07de11948303a4a929bad0b1dbd9156 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 28 Nov 2016 15:33:38 -0800 Subject: [PATCH 464/546] f2fs: do not activate auto_recovery for fallocated i_size commit 26787236b36660baf4d136281d40b5bb33a570ec upstream. If a file needs to keep its i_size by fallocate, we need to turn off auto recovery during roll-forward recovery. This will resolve the below scenario. 1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 4096" -c "fsync" 2. xfs_io -f /mnt/f2fs/file -c "falloc -k 4096 4096" -c "fsync" 3. md5sum /mnt/f2fs/file; 4. godown /mnt/f2fs/ 5. umount /mnt/f2fs/ 6. mount -t f2fs /dev/sdx /mnt/f2fs 7. md5sum /mnt/f2fs/file Reported-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 38 +++++++++++++++++++++----------------- fs/f2fs/file.c | 2 ++ fs/f2fs/recovery.c | 11 ++++++++--- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index fa66e5baa58a..4f1046be0d74 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -449,6 +449,7 @@ struct f2fs_map_blocks { #define FADVISE_LOST_PINO_BIT 0x02 #define FADVISE_ENCRYPT_BIT 0x04 #define FADVISE_ENC_NAME_BIT 0x08 +#define FADVISE_KEEP_SIZE_BIT 0x10 #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) #define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) @@ -461,6 +462,8 @@ struct f2fs_map_blocks { #define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) +#define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) +#define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) #define DEF_DIR_LEVEL 0 @@ -1763,23 +1766,6 @@ static inline void f2fs_i_size_write(struct inode *inode, loff_t i_size) set_inode_flag(inode, FI_AUTO_RECOVER); } -static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) -{ - if (dsync) { - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - bool ret; - - spin_lock(&sbi->inode_lock[DIRTY_META]); - ret = list_empty(&F2FS_I(inode)->gdirty_list); - spin_unlock(&sbi->inode_lock[DIRTY_META]); - return ret; - } - if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || - i_size_read(inode) & PAGE_MASK) - return false; - return F2FS_I(inode)->last_disk_size == i_size_read(inode); -} - static inline void f2fs_i_depth_write(struct inode *inode, unsigned int depth) { F2FS_I(inode)->i_current_depth = depth; @@ -1932,6 +1918,24 @@ static inline void clear_file(struct inode *inode, int type) f2fs_mark_inode_dirty_sync(inode, true); } +static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync) +{ + if (dsync) { + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + bool ret; + + spin_lock(&sbi->inode_lock[DIRTY_META]); + ret = list_empty(&F2FS_I(inode)->gdirty_list); + spin_unlock(&sbi->inode_lock[DIRTY_META]); + return ret; + } + if (!is_inode_flag_set(inode, FI_AUTO_RECOVER) || + file_keep_isize(inode) || + i_size_read(inode) & PAGE_MASK) + return false; + return F2FS_I(inode)->last_disk_size == i_size_read(inode); +} + static inline int f2fs_readonly(struct super_block *sb) { return sb->s_flags & MS_RDONLY; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 7fd8e7cffe9b..57b6dbcbbd88 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1403,6 +1403,8 @@ static long f2fs_fallocate(struct file *file, int mode, if (!ret) { inode->i_mtime = inode->i_ctime = current_time(inode); f2fs_mark_inode_dirty_sync(inode, false); + if (mode & FALLOC_FL_KEEP_SIZE) + file_set_keep_isize(inode); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); } diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 173212455db8..983c35da6bce 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -187,6 +187,8 @@ static void recover_inode(struct inode *inode, struct page *page) inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec); inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec); + F2FS_I(inode)->i_advise = raw->i_advise; + if (file_enc_name(inode)) name = ""; else @@ -425,7 +427,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, continue; } - if (i_size_read(inode) <= (start << PAGE_SHIFT)) + if (!file_keep_isize(inode) && + (i_size_read(inode) <= (start << PAGE_SHIFT))) f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); /* @@ -478,8 +481,10 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_put_dnode(&dn); out: f2fs_msg(sbi->sb, KERN_NOTICE, - "recover_data: ino = %lx, recovered = %d blocks, err = %d", - inode->i_ino, recovered, err); + "recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d", + inode->i_ino, + file_keep_isize(inode) ? "keep" : "recover", + recovered, err); return err; } From 9c9f348d72810331793839fe938db396f2672d02 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 28 Nov 2016 19:13:43 -0800 Subject: [PATCH 465/546] f2fs: return AOP_WRITEPAGE_ACTIVATE for writepage commit 0002b61bdaac732bcff364a18f5bd57c95def0a5 upstream. We should use AOP_WRITEPAGE_ACTIVATE when we bypass writing pages. Signed-off-by: Chao Yu Signed-off-by: Miao Xie Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index fa254daac970..283fc9de4762 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1376,6 +1376,8 @@ static int f2fs_write_data_page(struct page *page, redirty_out: redirty_page_for_writepage(wbc, page); + if (!err) + return AOP_WRITEPAGE_ACTIVATE; unlock_page(page); return err; } @@ -1471,6 +1473,15 @@ static int f2fs_write_cache_pages(struct address_space *mapping, ret = mapping->a_ops->writepage(page, wbc); if (unlikely(ret)) { + /* + * keep nr_to_write, since vfs uses this to + * get # of written pages. + */ + if (ret == AOP_WRITEPAGE_ACTIVATE) { + unlock_page(page); + ret = 0; + continue; + } done_index = page->index + 1; done = 1; break; From 9d8116ef64d96fe809d429489cfac956df511713 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 2 Dec 2016 15:11:32 -0800 Subject: [PATCH 466/546] Revert "f2fs: use percpu_counter for # of dirty pages in inode" commit 204706c7accfabb67b97eef9f9a28361b6201199 upstream. This reverts commit 1beba1b3a953107c3ff5448ab4e4297db4619c76. The perpcu_counter doesn't provide atomicity in single core and consume more DRAM. That incurs fs_mark test failure due to ENOMEM. Cc: stable@vger.kernel.org # 4.7+ Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 10 +++++----- fs/f2fs/file.c | 2 +- fs/f2fs/super.c | 7 +------ 3 files changed, 7 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 4f1046be0d74..c7eb2ff398ce 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -479,7 +479,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs*/ unsigned long flags; /* use to pass per-file flags */ struct rw_semaphore i_sem; /* protect fi info */ - struct percpu_counter dirty_pages; /* # of dirty pages */ + atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ nid_t i_xattr_nid; /* node id that contains xattrs */ @@ -1316,7 +1316,7 @@ static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) static inline void inode_inc_dirty_pages(struct inode *inode) { - percpu_counter_inc(&F2FS_I(inode)->dirty_pages); + atomic_inc(&F2FS_I(inode)->dirty_pages); inc_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1332,7 +1332,7 @@ static inline void inode_dec_dirty_pages(struct inode *inode) !S_ISLNK(inode->i_mode)) return; - percpu_counter_dec(&F2FS_I(inode)->dirty_pages); + atomic_dec(&F2FS_I(inode)->dirty_pages); dec_page_count(F2FS_I_SB(inode), S_ISDIR(inode->i_mode) ? F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA); } @@ -1342,9 +1342,9 @@ static inline s64 get_pages(struct f2fs_sb_info *sbi, int count_type) return atomic_read(&sbi->nr_pages[count_type]); } -static inline s64 get_dirty_pages(struct inode *inode) +static inline int get_dirty_pages(struct inode *inode) { - return percpu_counter_sum_positive(&F2FS_I(inode)->dirty_pages); + return atomic_read(&F2FS_I(inode)->dirty_pages); } static inline int get_blocktype_secs(struct f2fs_sb_info *sbi, int block_type) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 57b6dbcbbd88..5c0500813efe 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1537,7 +1537,7 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, - "Unexpected flush for atomic writes: ino=%lu, npages=%lld", + "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); if (ret) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index a288456c17c0..ce09191891f8 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -571,13 +571,9 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) init_once((void *) fi); - if (percpu_counter_init(&fi->dirty_pages, 0, GFP_NOFS)) { - kmem_cache_free(f2fs_inode_cachep, fi); - return NULL; - } - /* Initialize f2fs-specific inode info */ fi->vfs_inode.i_version = 1; + atomic_set(&fi->dirty_pages, 0); fi->i_current_depth = 1; fi->i_advise = 0; init_rwsem(&fi->i_sem); @@ -703,7 +699,6 @@ static void f2fs_i_callback(struct rcu_head *head) static void f2fs_destroy_inode(struct inode *inode) { - percpu_counter_destroy(&F2FS_I(inode)->dirty_pages); call_rcu(&inode->i_rcu, f2fs_i_callback); } From 323d6b4b69b929eee72e2a2532f3d06bfd9af3db Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 11:37:14 -0800 Subject: [PATCH 467/546] f2fs: call sync_fs when f2fs is idle commit f455c8a5f0a24090e99249eb7280012376adec2c upstream. The sync_fs in f2fs_balance_fs_bg must avoid interrupting current user requests. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 4f557e5e789d..b95f07559d90 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -381,12 +381,15 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) else build_free_nids(sbi, false); + if (!is_idle(sbi)) + return; + /* checkpoint is the only way to shrink partial cached entries */ if (!available_free_memory(sbi, NAT_ENTRIES) || !available_free_memory(sbi, INO_ENTRIES) || excess_prefree_segs(sbi) || excess_dirty_nats(sbi) || - (is_idle(sbi) && f2fs_time_over(sbi, CP_TIME))) { + f2fs_time_over(sbi, CP_TIME)) { if (test_opt(sbi, DATA_FLUSH)) { struct blk_plug plug; From 5a28184ecfce68e1cf0329227d1b941077fdee14 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 13:56:04 -0800 Subject: [PATCH 468/546] f2fs: detect wrong layout commit 2040fce83fe17763b07c97c1f691da2bb85e4135 upstream. Previous mkfs.f2fs allows small partition inappropriately, so f2fs should detect that as well. Refer this in f2fs-tools. mkfs.f2fs: detect small partition by overprovision ratio and # of segments Reported-by: Eric Biggers Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.h | 2 ++ fs/f2fs/super.c | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 89ab4301ef02..9d44ce83acb2 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -18,6 +18,8 @@ #define DEF_RECLAIM_PREFREE_SEGMENTS 5 /* 5% over total segments */ #define DEF_MAX_RECLAIM_PREFREE_SEGMENTS 4096 /* 8GB in maximum */ +#define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ + /* L: Logical segment # in volume, R: Relative segment # in main area */ #define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) #define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index ce09191891f8..07f4ba444733 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1453,6 +1453,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) unsigned int total, fsmeta; struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned int ovp_segments, reserved_segments; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1464,6 +1465,16 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) if (unlikely(fsmeta >= total)) return 1; + ovp_segments = le32_to_cpu(ckpt->overprov_segment_count); + reserved_segments = le32_to_cpu(ckpt->rsvd_segment_count); + + if (unlikely(fsmeta < F2FS_MIN_SEGMENTS || + ovp_segments == 0 || reserved_segments == 0)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Wrong layout: check mkfs.f2fs version"); + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; From 3cc247fdfa574c41d0ee41e63fc4d4e272b801a1 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 5 Dec 2016 17:25:32 -0800 Subject: [PATCH 469/546] f2fs: free meta pages if sanity check for ckpt is failed commit a2125ff7dd1ed3a2a53cdc1f8f9c9cec9cfaa7ab upstream. This fixes missing freeing meta pages in the error case. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 889317e07122..640f28576e88 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -768,7 +768,7 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) /* Sanity checking of checkpoint */ if (sanity_check_ckpt(sbi)) - goto fail_no_cp; + goto free_fail_no_cp; if (cur_page == cp1) sbi->cur_cp_pack = 1; @@ -796,6 +796,9 @@ int get_valid_checkpoint(struct f2fs_sb_info *sbi) f2fs_put_page(cp2, 1); return 0; +free_fail_no_cp: + f2fs_put_page(cp1, 1); + f2fs_put_page(cp2, 1); fail_no_cp: kfree(sbi->ckpt); return -EINVAL; From f913880bf2022f8b67d95a6c9e7675cca97d14bf Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 7 Dec 2016 16:23:32 -0800 Subject: [PATCH 470/546] f2fs: fix to access nullified flush_cmd_control pointer commit 5eba8c5d1fb3af28b2073ba5228d4998196c1bcc upstream. f2fs_sync_file() remount_ro - f2fs_readonly - destroy_flush_cmd_control - f2fs_issue_flush - no fcc pointer! So, this patch doesn't free fcc in this case, but just stop its kernel thread which sends flush commands. Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 33 +++++++++++++++++++++++++-------- fs/f2fs/super.c | 5 +++-- 3 files changed, 29 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index c7eb2ff398ce..3ef2d93ab936 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2150,7 +2150,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *, bool); void f2fs_balance_fs_bg(struct f2fs_sb_info *); int f2fs_issue_flush(struct f2fs_sb_info *); int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *); +void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index b95f07559d90..a288de069164 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -486,8 +486,13 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) if (!fcc->dispatch_list) wake_up(&fcc->flush_wait_queue); - wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + if (fcc->f2fs_issue_flush) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->submit_flush); + } else { + llist_del_all(&fcc->issue_list); + atomic_set(&fcc->submit_flush, 0); + } return cmd.ret; } @@ -498,6 +503,11 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; + if (SM_I(sbi)->cmd_control_info) { + fcc = SM_I(sbi)->cmd_control_info; + goto init_thread; + } + fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; @@ -505,6 +515,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->cmd_control_info = fcc; +init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { @@ -517,14 +528,20 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) return err; } -void destroy_flush_cmd_control(struct f2fs_sb_info *sbi) +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; - if (fcc && fcc->f2fs_issue_flush) - kthread_stop(fcc->f2fs_issue_flush); - kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + if (fcc && fcc->f2fs_issue_flush) { + struct task_struct *flush_thread = fcc->f2fs_issue_flush; + + fcc->f2fs_issue_flush = NULL; + kthread_stop(flush_thread); + } + if (free) { + kfree(fcc); + SM_I(sbi)->cmd_control_info = NULL; + } } static void __locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, @@ -2658,7 +2675,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; - destroy_flush_cmd_control(sbi); + destroy_flush_cmd_control(sbi, true); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 07f4ba444733..e6d8d011786c 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1103,8 +1103,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * or if flush_merge is not passed in mount option. */ if ((*flags & MS_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { - destroy_flush_cmd_control(sbi); - } else if (!SM_I(sbi)->cmd_control_info) { + clear_opt(sbi, FLUSH_MERGE); + destroy_flush_cmd_control(sbi, false); + } else { err = create_flush_cmd_control(sbi); if (err) goto restore_gc; From cc0b17f6ccd91b8fe970955b16c2674db10da1c8 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Sun, 11 Dec 2016 15:35:15 +0800 Subject: [PATCH 471/546] f2fs: fix a missing size change in f2fs_setattr commit c0ed4405a99ec9be2a0f062eaafc002d8d26c99f upstream. This patch fix a missing size change in f2fs_setattr Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5c0500813efe..5808d5c709a7 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -678,6 +678,7 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); int err; + bool size_changed = false; err = inode_change_ok(inode, attr); if (err) @@ -708,6 +709,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } inode->i_mtime = inode->i_ctime = current_time(inode); } + + size_changed = true; } __setattr_copy(inode, attr); @@ -720,8 +723,8 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) } } - /* update attributes only */ - f2fs_mark_inode_dirty_sync(inode, false); + /* file size may changed here */ + f2fs_mark_inode_dirty_sync(inode, size_changed); /* inode change will produce dirty node pages flushed by checkpoint */ f2fs_balance_fs(F2FS_I_SB(inode), true); From d8e59293ffe359bf0a94fc69e7f4403aecdfb509 Mon Sep 17 00:00:00 2001 From: Ido Schimmel Date: Thu, 3 Aug 2017 13:28:22 +0200 Subject: [PATCH 472/546] UPSTREAM: ipv6: fib: Unlink replaced routes from their nodes When a route is deleted its node pointer is set to NULL to indicate it's no longer linked to its node. Do the same for routes that are replaced. This will later allow us to test if a route is still in the FIB by checking its node pointer instead of its reference count. Signed-off-by: Ido Schimmel Signed-off-by: Jiri Pirko Signed-off-by: David S. Miller Cherry-pick from: 7483cea79957312e9f8e9cf760a1bc5d6c507113 Bug: 64978549 Change-Id: Ibfa54cf918084138b6b19437e9ef86bfaea5deae --- net/ipv6/ip6_fib.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c index c23e02a7ccb0..bf3824b59597 100644 --- a/net/ipv6/ip6_fib.c +++ b/net/ipv6/ip6_fib.c @@ -909,6 +909,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, fn->fn_flags |= RTN_RTINFO; } nsiblings = iter->rt6i_nsiblings; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; @@ -923,6 +924,7 @@ static int fib6_add_rt2node(struct fib6_node *fn, struct rt6_info *rt, break; if (rt6_qualify_for_ecmp(iter)) { *ins = iter->dst.rt6_next; + iter->rt6i_node = NULL; fib6_purge_rt(iter, fn, info->nl_net); if (fn->rr_ptr == iter) fn->rr_ptr = NULL; From ca027af9a9c7bfa9112ba1fcba1aa6ddd42ad0ce Mon Sep 17 00:00:00 2001 From: Todd Kjos Date: Mon, 25 Sep 2017 08:55:09 -0700 Subject: [PATCH 473/546] FROMLIST: binder: fix use-after-free in binder_transaction() (from https://patchwork.kernel.org/patch/9978801/) User-space normally keeps the node alive when creating a transaction since it has a reference to the target. The local strong ref keeps it alive if the sending process dies before the target process processes the transaction. If the source process is malicious or has a reference counting bug, this can fail. In this case, when we attempt to decrement the node in the failure path, the node has already been freed. This is fixed by taking a tmpref on the node while constructing the transaction. To avoid re-acquiring the node lock and inner proc lock to increment the proc's tmpref, a helper is used that does the ref increments on both the node and proc. Bug: 66899329 Change-Id: Iad40e1e0bccee88234900494fb52a510a37fe8d7 Signed-off-by: Todd Kjos --- drivers/android/binder.c | 95 ++++++++++++++++++++++++++++------------ 1 file changed, 67 insertions(+), 28 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 7c9f9dde8397..32a2b2f44691 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2744,6 +2744,48 @@ static bool binder_proc_transaction(struct binder_transaction *t, return true; } +/** + * binder_get_node_refs_for_txn() - Get required refs on node for txn + * @node: struct binder_node for which to get refs + * @proc: returns @node->proc if valid + * @error: if no @proc then returns BR_DEAD_REPLY + * + * User-space normally keeps the node alive when creating a transaction + * since it has a reference to the target. The local strong ref keeps it + * alive if the sending process dies before the target process processes + * the transaction. If the source process is malicious or has a reference + * counting bug, relying on the local strong ref can fail. + * + * Since user-space can cause the local strong ref to go away, we also take + * a tmpref on the node to ensure it survives while we are constructing + * the transaction. We also need a tmpref on the proc while we are + * constructing the transaction, so we take that here as well. + * + * Return: The target_node with refs taken or NULL if no @node->proc is NULL. + * Also sets @proc if valid. If the @node->proc is NULL indicating that the + * target proc has died, @error is set to BR_DEAD_REPLY + */ +static struct binder_node *binder_get_node_refs_for_txn( + struct binder_node *node, + struct binder_proc **procp, + uint32_t *error) +{ + struct binder_node *target_node = NULL; + + binder_node_inner_lock(node); + if (node->proc) { + target_node = node; + binder_inc_node_nilocked(node, 1, 0, NULL); + binder_inc_node_tmpref_ilocked(node); + node->proc->tmp_ref++; + *procp = node->proc; + } else + *error = BR_DEAD_REPLY; + binder_node_inner_unlock(node); + + return target_node; +} + static void binder_transaction(struct binder_proc *proc, struct binder_thread *thread, struct binder_transaction_data *tr, int reply, @@ -2846,43 +2888,35 @@ static void binder_transaction(struct binder_proc *proc, ref = binder_get_ref_olocked(proc, tr->target.handle, true); if (ref) { - binder_inc_node(ref->node, 1, 0, NULL); - target_node = ref->node; + target_node = binder_get_node_refs_for_txn( + ref->node, &target_proc, + &return_error); + } else { + binder_user_error("%d:%d got transaction to invalid handle\n", + proc->pid, thread->pid); + return_error = BR_FAILED_REPLY; } binder_proc_unlock(proc); - if (target_node == NULL) { - binder_user_error("%d:%d got transaction to invalid handle\n", - proc->pid, thread->pid); - return_error = BR_FAILED_REPLY; - return_error_param = -EINVAL; - return_error_line = __LINE__; - goto err_invalid_target_handle; - } } else { mutex_lock(&context->context_mgr_node_lock); target_node = context->binder_context_mgr_node; - if (target_node == NULL) { + if (target_node) + target_node = binder_get_node_refs_for_txn( + target_node, &target_proc, + &return_error); + else return_error = BR_DEAD_REPLY; - mutex_unlock(&context->context_mgr_node_lock); - return_error_line = __LINE__; - goto err_no_context_mgr_node; - } - binder_inc_node(target_node, 1, 0, NULL); mutex_unlock(&context->context_mgr_node_lock); } - e->to_node = target_node->debug_id; - binder_node_lock(target_node); - target_proc = target_node->proc; - if (target_proc == NULL) { - binder_node_unlock(target_node); - return_error = BR_DEAD_REPLY; + if (!target_node) { + /* + * return_error is set above + */ + return_error_param = -EINVAL; return_error_line = __LINE__; goto err_dead_binder; } - binder_inner_proc_lock(target_proc); - target_proc->tmp_ref++; - binder_inner_proc_unlock(target_proc); - binder_node_unlock(target_node); + e->to_node = target_node->debug_id; if (security_binder_transaction(proc->tsk, target_proc->tsk) < 0) { return_error = BR_FAILED_REPLY; @@ -3241,6 +3275,8 @@ static void binder_transaction(struct binder_proc *proc, if (target_thread) binder_thread_dec_tmpref(target_thread); binder_proc_dec_tmpref(target_proc); + if (target_node) + binder_dec_node_tmpref(target_node); /* * write barrier to synchronize with initialization * of log entry @@ -3260,6 +3296,8 @@ static void binder_transaction(struct binder_proc *proc, err_copy_data_failed: trace_binder_transaction_failed_buffer_release(t->buffer); binder_transaction_buffer_release(target_proc, t->buffer, offp); + if (target_node) + binder_dec_node_tmpref(target_node); target_node = NULL; t->buffer->transaction = NULL; binder_alloc_free_buf(&target_proc->alloc, t->buffer); @@ -3274,13 +3312,14 @@ static void binder_transaction(struct binder_proc *proc, err_empty_call_stack: err_dead_binder: err_invalid_target_handle: -err_no_context_mgr_node: if (target_thread) binder_thread_dec_tmpref(target_thread); if (target_proc) binder_proc_dec_tmpref(target_proc); - if (target_node) + if (target_node) { binder_dec_node(target_node, 1, 0); + binder_dec_node_tmpref(target_node); + } binder_debug(BINDER_DEBUG_FAILED_TRANSACTION, "%d:%d transaction failed %d/%d, size %lld-%lld line %d\n", From 80bd5344d759ad02cd9f1971ff103aeac69e148e Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 09:55:38 -0800 Subject: [PATCH 474/546] f2fs: remove wrong backported codes Kconfig and dentry RCU mode fixes. Fixes: c1286ff41c2f610df ("f2fs: backport from 'for-f2fs-4.9'") Signed-off-by: Jaegeuk Kim --- fs/f2fs/Kconfig | 1 - fs/f2fs/namei.c | 3 --- 2 files changed, 4 deletions(-) diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 1852d99df97b..378c221d68a9 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -2,7 +2,6 @@ config F2FS_FS tristate "F2FS filesystem support" depends on BLOCK select CRYPTO - select KEYS select CRYPTO_CRC32 help F2FS is based on Log-structured File System (LFS), which supports diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 468b2dbe6d34..523bf073642e 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -1014,9 +1014,6 @@ static const char *f2fs_encrypted_follow_link(struct dentry *dentry, void **cook u32 max_size = inode->i_sb->s_blocksize; int res; - if (!dentry) - return ERR_PTR(-ECHILD); - res = fscrypt_get_encryption_info(inode); if (res) return ERR_PTR(res); From cfbece022650d5a8c44a326ac96b3baa5a22ee4a Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 18:24:54 -0800 Subject: [PATCH 475/546] f2fs: resolve op and op_flags confilcts commit 70fd76140a6cb63262bd47b68d57b42e889c10ee upstream. This patch backported ("block,fs: use REQ_* flags directly") Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 11 +++++--- fs/f2fs/data.c | 54 ++++++++++++++++++++----------------- fs/f2fs/f2fs.h | 24 +++++++++++++++-- fs/f2fs/gc.c | 12 ++++++--- fs/f2fs/inline.c | 3 ++- fs/f2fs/node.c | 12 +++++---- fs/f2fs/segment.c | 9 ++++--- fs/f2fs/trace.c | 7 ++--- include/trace/events/f2fs.h | 19 ++++++++----- 9 files changed, 98 insertions(+), 53 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 640f28576e88..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -64,14 +64,15 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = READ_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC | REQ_META | REQ_PRIO, .old_blkaddr = index, .new_blkaddr = index, .encrypted_page = NULL, }; if (unlikely(!is_meta)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; repeat: page = f2fs_grab_cache_page(mapping, index, false); if (!page) { @@ -158,13 +159,15 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = sync ? (READ_SYNC | REQ_META | REQ_PRIO) : READA, + .op = REQ_OP_READ, + .op_flags = sync ? (REQ_SYNC | REQ_META | REQ_PRIO) : + REQ_RAHEAD, .encrypted_page = NULL, }; struct blk_plug plug; if (unlikely(type == META_POR)) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 283fc9de4762..2eddf1daf995 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -168,15 +168,15 @@ static struct bio *__bio_alloc(struct f2fs_sb_info *sbi, block_t blk_addr, return bio; } -static inline void __submit_bio(struct f2fs_sb_info *sbi, int rw, - struct bio *bio, enum page_type type) +static inline void __submit_bio(struct f2fs_sb_info *sbi, + struct bio *bio, enum page_type type) { - if (!is_read_io(rw)) { + if (!is_read_io(bio_op(bio))) { if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } - submit_bio(rw, bio); + submit_bio(0, bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -186,12 +186,14 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->rw)) + if (is_read_io(fio->op)) trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); else trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - __submit_bio(io->sbi, fio->rw, io->bio, fio->type); + bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } @@ -257,10 +259,10 @@ static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; - if (test_opt(sbi, NOBARRIER)) - io->fio.rw = WRITE_FLUSH | REQ_META | REQ_PRIO; - else - io->fio.rw = WRITE_FLUSH_FUA | REQ_META | REQ_PRIO; + io->fio.op = REQ_OP_WRITE; + io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + if (!test_opt(sbi, NOBARRIER)) + io->fio.op_flags |= REQ_FUA; } __submit_merged_bio(io); out: @@ -302,14 +304,15 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) f2fs_trace_ios(fio, 0); /* Allocate a new bio */ - bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->rw)); + bio = __bio_alloc(fio->sbi, fio->new_blkaddr, 1, is_read_io(fio->op)); if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); return -EFAULT; } + bio_set_op_attrs(bio, fio->op, fio->op_flags); - __submit_bio(fio->sbi, fio->rw, bio, fio->type); + __submit_bio(fio->sbi, bio, fio->type); return 0; } @@ -318,7 +321,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->rw); + bool is_read = is_read_io(fio->op); struct page *bio_page; io = is_read ? &sbi->read_io : &sbi->write_io[btype]; @@ -335,7 +338,7 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) down_write(&io->io_rwsem); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || - (io->fio.rw != fio->rw) || + (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || !__same_bdev(sbi, fio->new_blkaddr, io->bio))) __submit_merged_bio(io); alloc_new: @@ -463,7 +466,7 @@ int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) } struct page *get_read_data_page(struct inode *inode, pgoff_t index, - int rw, bool for_write) + int op_flags, bool for_write) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; @@ -473,7 +476,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .encrypted_page = NULL, }; @@ -541,7 +545,7 @@ struct page *find_data_page(struct inode *inode, pgoff_t index) return page; f2fs_put_page(page, 0); - page = get_read_data_page(inode, index, READ_SYNC, false); + page = get_read_data_page(inode, index, REQ_SYNC, false); if (IS_ERR(page)) return page; @@ -567,7 +571,7 @@ struct page *get_lock_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct page *page; repeat: - page = get_read_data_page(inode, index, READ_SYNC, for_write); + page = get_read_data_page(inode, index, REQ_SYNC, for_write); if (IS_ERR(page)) return page; @@ -1145,7 +1149,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, if (bio && (last_block_in_bio != block_nr - 1 || !__same_bdev(F2FS_I_SB(inode), block_nr, bio))) { submit_and_realloc: - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } if (bio == NULL) { @@ -1154,6 +1158,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, bio = NULL; goto set_error_page; } + bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1168,7 +1173,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, goto next_page; confused: if (bio) { - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); bio = NULL; } unlock_page(page); @@ -1178,7 +1183,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, } BUG_ON(pages && !list_empty(pages)); if (bio) - __submit_bio(F2FS_I_SB(inode), READ, bio, DATA); + __submit_bio(F2FS_I_SB(inode), bio, DATA); return 0; } @@ -1296,7 +1301,8 @@ static int f2fs_write_data_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; @@ -1728,14 +1734,14 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, err = PTR_ERR(bio); goto fail; } - + bio->bi_rw = READ_SYNC; if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { bio_put(bio); err = -EFAULT; goto fail; } - __submit_bio(sbi, READ_SYNC, bio, DATA); + __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3ef2d93ab936..d0c7decdd3ac 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -24,6 +24,7 @@ #include #include #include +#include #ifdef CONFIG_F2FS_CHECK_FS #define f2fs_bug_on(sbi, condition) BUG_ON(condition) @@ -112,6 +113,24 @@ struct f2fs_mount_info { #define F2FS_CLEAR_FEATURE(sb, mask) \ F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +static inline int wbc_to_write_flags(struct writeback_control *wbc) +{ + if (wbc->sync_mode == WB_SYNC_ALL) + return REQ_SYNC; + return 0; +} + /** * wq_has_sleeper - check if there are any waiting processes * @wq: wait queue head @@ -746,14 +765,15 @@ enum page_type { struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ - int rw; /* contains R/RS/W/WS with REQ_META/REQ_PRIO */ + int op; /* contains REQ_OP_ */ + int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ }; -#define is_read_io(rw) (((rw) & 1) == READ) +#define is_read_io(rw) (rw == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 6390d45c1b68..d3a36e4b442c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -550,7 +550,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = READ_SYNC, + .op = REQ_OP_READ, + .op_flags = REQ_SYNC, .encrypted_page = NULL, }; struct dnode_of_data dn; @@ -627,7 +628,8 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, /* allocate block address */ f2fs_wait_on_page_writeback(dn.node_page, NODE, true); - fio.rw = WRITE_SYNC; + fio.op = REQ_OP_WRITE; + fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; f2fs_submit_page_mbio(&fio); @@ -668,7 +670,8 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, - .rw = WRITE_SYNC, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE, .page = page, .encrypted_page = NULL, }; @@ -767,7 +770,8 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, start_bidx = start_bidx_of_node(nofs, inode); data_page = get_read_data_page(inode, - start_bidx + ofs_in_node, READA, true); + start_bidx + ofs_in_node, REQ_RAHEAD, + true); if (IS_ERR(data_page)) { iput(inode); continue; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index b85987703d1e..fb5d7d1f34aa 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -128,7 +128,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index edacbabb92cf..26a745c544fc 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -1068,14 +1068,15 @@ struct page *new_node_page(struct dnode_of_data *dn, * 0: f2fs_put_page(page, 0) * LOCKED_PAGE or error: f2fs_put_page(page, 1) */ -static int read_node_page(struct page *page, int rw) +static int read_node_page(struct page *page, int op_flags) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = rw, + .op = REQ_OP_READ, + .op_flags = op_flags, .page = page, .encrypted_page = NULL, }; @@ -1116,7 +1117,7 @@ void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (!apage) return; - err = read_node_page(apage, READA); + err = read_node_page(apage, REQ_RAHEAD); f2fs_put_page(apage, err ? 1 : 0); } @@ -1134,7 +1135,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (!page) return ERR_PTR(-ENOMEM); - err = read_node_page(page, READ_SYNC); + err = read_node_page(page, REQ_SYNC); if (err < 0) { f2fs_put_page(page, 1); return ERR_PTR(err); @@ -1575,7 +1576,8 @@ static int f2fs_write_node_page(struct page *page, struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, - .rw = (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : WRITE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), .page = page, .encrypted_page = NULL, }; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a288de069164..70aec4a8de13 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -257,7 +257,8 @@ static int __commit_inmem_pages(struct inode *inode, struct f2fs_io_info fio = { .sbi = sbi, .type = DATA, - .rw = WRITE_SYNC | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .encrypted_page = NULL, }; bool submit_bio = false; @@ -407,6 +408,7 @@ static int __submit_flush_wait(struct block_device *bdev) struct bio *bio = f2fs_bio_alloc(0); int ret; + bio->bi_rw = REQ_OP_WRITE; bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); @@ -1544,7 +1546,8 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_io_info fio = { .sbi = sbi, .type = META, - .rw = WRITE_SYNC | REQ_META | REQ_PRIO, + .op = REQ_OP_WRITE, + .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_META | REQ_PRIO, .old_blkaddr = page->index, .new_blkaddr = page->index, .page = page, @@ -1552,7 +1555,7 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) - fio.rw &= ~REQ_META; + fio.op_flags &= ~REQ_META; set_page_writeback(page); f2fs_submit_page_mbio(&fio); diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 562ce0821559..73b4e1d1912a 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -25,11 +25,11 @@ static inline void __print_last_io(void) if (!last_io.len) return; - trace_printk("%3x:%3x %4x %-16s %2x %5x %12x %4x\n", + trace_printk("%3x:%3x %4x %-16s %2x %5x %5x %12x %4x\n", last_io.major, last_io.minor, last_io.pid, "----------------", last_io.type, - last_io.fio.rw, + last_io.fio.op, last_io.fio.op_flags, last_io.fio.new_blkaddr, last_io.len); memset(&last_io, 0, sizeof(last_io)); @@ -101,7 +101,8 @@ void f2fs_trace_ios(struct f2fs_io_info *fio, int flush) if (last_io.major == major && last_io.minor == minor && last_io.pid == pid && last_io.type == __file_type(inode, pid) && - last_io.fio.rw == fio->rw && + last_io.fio.op == fio->op && + last_io.fio.op_flags == fio->op_flags && last_io.fio.new_blkaddr + last_io.len == fio->new_blkaddr) { last_io.len++; diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 90d6ad49a9c5..7ad46e8a89e6 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -59,7 +59,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); #define F2FS_BIO_MASK(t) (t & (READA | WRITE_FLUSH_FUA)) #define F2FS_BIO_EXTRA_MASK(t) (t & (REQ_META | REQ_PRIO)) -#define show_bio_type(type) show_bio_base(type), show_bio_extra(type) +#define show_bio_type(op, op_flags) \ + show_bio_base((op|op_flags)), show_bio_extra((op|op_flags)) #define show_bio_base(type) \ __print_symbolic(F2FS_BIO_MASK(type), \ @@ -734,7 +735,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(pgoff_t, index) __field(block_t, old_blkaddr) __field(block_t, new_blkaddr) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) ), @@ -744,7 +746,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->index = page->index; __entry->old_blkaddr = fio->old_blkaddr; __entry->new_blkaddr = fio->new_blkaddr; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; ), @@ -754,7 +757,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type)) ); @@ -785,7 +788,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_STRUCT__entry( __field(dev_t, dev) - __field(int, rw) + __field(int, op) + __field(int, op_flags) __field(int, type) __field(sector_t, sector) __field(unsigned int, size) @@ -793,7 +797,8 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->rw = fio->rw; + __entry->op = fio->op; + __entry->op_flags = fio->op_flags; __entry->type = fio->type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; @@ -801,7 +806,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", show_dev(__entry), - show_bio_type(__entry->rw), + show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, __entry->size) From 64d8b297c4c2af070c6a812044cebfef5498cc5b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 16:41:25 -0800 Subject: [PATCH 476/546] f2fs: support async discard based on v4.9 commit 275b66b09e85cf0520dc610dd89706952751a473 upstream. This patch is based on commit 275b66b09e85 (f2fs: support async discard). Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 +- fs/f2fs/f2fs.h | 3 +- fs/f2fs/segment.c | 183 ++++++++++++++++++++++++++++++++++++++++--- 3 files changed, 181 insertions(+), 12 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..d485bea3d6bb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,6 +1255,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1273,10 +1274,12 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) + if (err) { release_discard_addrs(sbi); - else + } else { clear_prefree_segments(sbi, cpc); + f2fs_wait_all_discard_bio(sbi); + } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d0c7decdd3ac..883d3ab388c1 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -127,7 +127,7 @@ static inline void bio_set_op_attrs(struct bio *bio, unsigned op, static inline int wbc_to_write_flags(struct writeback_control *wbc) { if (wbc->sync_mode == WB_SYNC_ALL) - return REQ_SYNC; + return REQ_SYNC | REQ_NOIDLE; return 0; } @@ -2174,6 +2174,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 70aec4a8de13..13bea6e5120e 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,6 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; +static struct kmem_cache *bio_entry_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -622,6 +623,162 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } +static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, + struct bio *bio) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + + INIT_LIST_HEAD(&be->list); + be->bio = bio; + init_completion(&be->event); + list_add_tail(&be->list, wait_list); + + return be; +} + +void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +{ + struct list_head *wait_list = &(SM_I(sbi)->wait_list); + struct bio_entry *be, *tmp; + + list_for_each_entry_safe(be, tmp, wait_list, list) { + struct bio *bio = be->bio; + int err; + + wait_for_completion_io(&be->event); + err = be->error; + if (err == -EOPNOTSUPP) + err = 0; + + if (err) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard failed, ret: %d", err); + + bio_put(bio); + list_del(&be->list); + kmem_cache_free(bio_entry_slab, be); + } +} + +static void f2fs_submit_bio_wait_endio(struct bio *bio) +{ + struct bio_entry *be = (struct bio_entry *)bio->bi_private; + + be->error = bio->bi_error; + complete(&be->event); +} + +/* copied from block/blk-lib.c in 4.10-rc1 */ +static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, + sector_t nr_sects, gfp_t gfp_mask, int flags, + struct bio **biop) +{ + struct request_queue *q = bdev_get_queue(bdev); + struct bio *bio = *biop; + unsigned int granularity; + int op = REQ_WRITE | REQ_DISCARD; + int alignment; + sector_t bs_mask; + + if (!q) + return -ENXIO; + + if (!blk_queue_discard(q)) + return -EOPNOTSUPP; + + if (flags & BLKDEV_DISCARD_SECURE) { + if (!blk_queue_secdiscard(q)) + return -EOPNOTSUPP; + op |= REQ_SECURE; + } + + bs_mask = (bdev_logical_block_size(bdev) >> 9) - 1; + if ((sector | nr_sects) & bs_mask) + return -EINVAL; + + /* Zero-sector (unknown) and one-sector granularities are the same. */ + granularity = max(q->limits.discard_granularity >> 9, 1U); + alignment = (bdev_discard_alignment(bdev) >> 9) % granularity; + + while (nr_sects) { + unsigned int req_sects; + sector_t end_sect, tmp; + + /* Make sure bi_size doesn't overflow */ + req_sects = min_t(sector_t, nr_sects, UINT_MAX >> 9); + + /** + * If splitting a request, and the next starting sector would be + * misaligned, stop the discard at the previous aligned sector. + */ + end_sect = sector + req_sects; + tmp = end_sect; + if (req_sects < nr_sects && + sector_div(tmp, granularity) != alignment) { + end_sect = end_sect - alignment; + sector_div(end_sect, granularity); + end_sect = end_sect * granularity + alignment; + req_sects = end_sect - sector; + } + + if (bio) { + int ret = submit_bio_wait(0, bio); + bio_put(bio); + if (ret) + return ret; + } + bio = f2fs_bio_alloc(0); + bio->bi_iter.bi_sector = sector; + bio->bi_bdev = bdev; + bio_set_op_attrs(bio, op, 0); + + bio->bi_iter.bi_size = req_sects << 9; + nr_sects -= req_sects; + sector = end_sect; + + /* + * We can loop for a long time in here, if someone does + * full device discards (like mkfs). Be nice and allow + * us to schedule out to avoid softlocking if preempt + * is disabled. + */ + cond_resched(); + } + + *biop = bio; + return 0; +} + +/* this function is copied from blkdev_issue_discard from block/blk-lib.c */ +static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t blkstart, block_t blklen) +{ + struct bio *bio = NULL; + int err; + + trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + + if (sbi->s_ndevs) { + int devi = f2fs_target_device_index(sbi, blkstart); + + blkstart -= FDEV(devi).start_blk; + } + err = __blkdev_issue_discard(bdev, + SECTOR_FROM_BLOCK(blkstart), + SECTOR_FROM_BLOCK(blklen), + GFP_NOFS, 0, &bio); + if (!err && bio) { + struct bio_entry *be = __add_bio_entry(sbi, bio); + + bio->bi_private = be; + bio->bi_end_io = f2fs_submit_bio_wait_endio; + submit_bio(REQ_SYNC, bio); + } + + return err; +} + #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) @@ -655,8 +812,7 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return blkdev_issue_discard(bdev, sector, nr_sects, - GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: trace_f2fs_issue_reset_zone(sbi->sb, blkstart); @@ -672,15 +828,12 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, static int __issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t start = SECTOR_FROM_BLOCK(blkstart); - sector_t len = SECTOR_FROM_BLOCK(blklen); - #ifdef CONFIG_BLK_DEV_ZONED if (f2fs_sb_mounted_blkzoned(sbi->sb) && bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return blkdev_issue_discard(bdev, start, len, GFP_NOFS, 0); + return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -720,8 +873,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, if (len) err = __issue_discard_async(sbi, bdev, start, len); - - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); return err; } @@ -822,11 +973,14 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct list_head *head = &(SM_I(sbi)->discard_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; bool force = (cpc->reason == CP_DISCARD); + blk_start_plug(&plug); + mutex_lock(&dirty_i->seglist_lock); while (1) { @@ -875,6 +1029,8 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) SM_I(sbi)->nr_discards -= entry->len; kmem_cache_free(discard_entry_slab, entry); } + + blk_finish_plug(&plug); } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -2551,6 +2707,7 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; INIT_LIST_HEAD(&sm_info->discard_list); + INIT_LIST_HEAD(&sm_info->wait_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2694,10 +2851,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; + bio_entry_slab = f2fs_kmem_cache_create("bio_entry", + sizeof(struct bio_entry)); + if (!bio_entry_slab) + goto destroy_discard_entry; + sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_discard_entry; + goto destroy_bio_entry; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2707,6 +2869,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); +destroy_bio_entry: + kmem_cache_destroy(bio_entry_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2716,6 +2880,7 @@ int __init create_segment_manager_caches(void) void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); + kmem_cache_destroy(bio_entry_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From cb1f89c83e388dead32986e895533fb3bedc18ce Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 13 Dec 2016 17:23:37 +0800 Subject: [PATCH 477/546] f2fs: remove unused values in recover_fsync_data commit fed24668482e07421b8e746a4886e7725434050a upstream. This patch remove unused values in function recover_fsync_data Signed-off-by: Yunlei He Signed-off-by: Jaegeuk Kim --- fs/f2fs/recovery.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 983c35da6bce..4a3c48c24c10 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -552,10 +552,8 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) { - struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_WARM_NODE); struct list_head inode_list; struct list_head dir_list; - block_t blkaddr; int err; int ret = 0; bool need_writecp = false; @@ -571,8 +569,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) /* prevent checkpoint */ mutex_lock(&sbi->cp_mutex); - blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); - /* step #1: find fsynced inode numbers */ err = find_fsync_dnodes(sbi, &inode_list); if (err || list_empty(&inode_list)) From c85ef3084f33548fd4895ebde82c0022e988c4ca Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Tue, 13 Dec 2016 18:54:59 +0800 Subject: [PATCH 478/546] f2fs: don't cache nat entry if out of memory commit 5c9e418436f3445d7cc4f3ba2964f231a4b33f17 upstream. If we run out of memory, in cache_nat_entry, it's better to avoid loop for allocating memory to cache nat entry, so in low memory scenario, for read path of node block, I expect this can avoid unneeded latency. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/node.c | 27 ++++++++++++++++++++------- 1 file changed, 20 insertions(+), 7 deletions(-) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 26a745c544fc..b01b01cfc39e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -245,12 +245,24 @@ bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino) return need_update; } -static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid) +static struct nat_entry *grab_nat_entry(struct f2fs_nm_info *nm_i, nid_t nid, + bool no_fail) { struct nat_entry *new; - new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); - f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + if (no_fail) { + new = f2fs_kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + f2fs_radix_tree_insert(&nm_i->nat_root, nid, new); + } else { + new = kmem_cache_alloc(nat_entry_slab, GFP_NOFS); + if (!new) + return NULL; + if (radix_tree_insert(&nm_i->nat_root, nid, new)) { + kmem_cache_free(nat_entry_slab, new); + return NULL; + } + } + memset(new, 0, sizeof(struct nat_entry)); nat_set_nid(new, nid); nat_reset_flag(new); @@ -267,8 +279,9 @@ static void cache_nat_entry(struct f2fs_sb_info *sbi, nid_t nid, e = __lookup_nat_cache(nm_i, nid); if (!e) { - e = grab_nat_entry(nm_i, nid); - node_info_from_raw_nat(&e->ni, ne); + e = grab_nat_entry(nm_i, nid, false); + if (e) + node_info_from_raw_nat(&e->ni, ne); } else { f2fs_bug_on(sbi, nat_get_ino(e) != le32_to_cpu(ne->ino) || nat_get_blkaddr(e) != @@ -286,7 +299,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, down_write(&nm_i->nat_tree_lock); e = __lookup_nat_cache(nm_i, ni->nid); if (!e) { - e = grab_nat_entry(nm_i, ni->nid); + e = grab_nat_entry(nm_i, ni->nid, true); copy_node_info(&e->ni, ni); f2fs_bug_on(sbi, ni->blk_addr == NEW_ADDR); } else if (new_blkaddr == NEW_ADDR) { @@ -2155,7 +2168,7 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) ne = __lookup_nat_cache(nm_i, nid); if (!ne) { - ne = grab_nat_entry(nm_i, nid); + ne = grab_nat_entry(nm_i, nid, true); node_info_from_raw_nat(&ne->ni, &raw_ne); } From 9e36c28b49f1ed76f4db1550d41d2858556abaeb Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Fri, 16 Dec 2016 11:18:15 +0300 Subject: [PATCH 479/546] f2fs: remove unneeded condition commit 07fe8d44409f88be8f4a4e8f22b47ee709a22657 upstream. We checked that "inode" is not an error pointer earlier so there is no need to check again here. Signed-off-by: Dan Carpenter Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 523bf073642e..ca9e2f85eae8 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -321,9 +321,9 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (err) goto err_out; } - if (!IS_ERR(inode) && f2fs_encrypted_inode(dir) && - (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && - !fscrypt_has_permitted_context(dir, inode)) { + if (f2fs_encrypted_inode(dir) && + (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && + !fscrypt_has_permitted_context(dir, inode)) { bool nokey = f2fs_encrypted_inode(inode) && !fscrypt_has_encryption_key(inode); err = nokey ? -ENOKEY : -EPERM; From b1e97c8a7e672c4277363b3577f7b4805ada7add Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Mon, 19 Dec 2016 20:10:48 +0800 Subject: [PATCH 480/546] f2fs: fix a problem of using memory after free commit 7855eba4d6102f811b6dd142d6c749f53b591fa3 upstream. This patch fix a problem of using memory after free in function __try_merge_extent_node. Fixes: 0f825ee6e873 ("f2fs: add new interfaces for extent tree") Cc: Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 4db44da7ef69..e02c3d88dc9a 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -352,11 +352,12 @@ static struct extent_node *__try_merge_extent_node(struct inode *inode, } if (next_ex && __is_front_mergeable(ei, &next_ex->ei)) { - if (en) - __release_extent_node(sbi, et, prev_ex); next_ex->ei.fofs = ei->fofs; next_ex->ei.blk = ei->blk; next_ex->ei.len += ei->len; + if (en) + __release_extent_node(sbi, et, prev_ex); + en = next_ex; } From 71744660e15a45748cd732665cbdab46f51355df Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Tue, 20 Dec 2016 11:11:35 +0800 Subject: [PATCH 481/546] f2fs: add a case of no need to read a page in write begin commit 746e2403927efbd7c7f2e796314e3cfb3cfabaa4 upstream. If the range we write cover the whole valid data in the last page, we do not need to read it. Signed-off-by: Yunlei He [Jaegeuk Kim: nullify the remaining area (fix: xfstests/f2fs/001)] Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 2eddf1daf995..d7c20880e53e 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1723,6 +1723,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, if (len == PAGE_SIZE || PageUptodate(page)) return 0; + if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode)) { + zero_user_segment(page, len, PAGE_SIZE); + return 0; + } + if (blkaddr == NEW_ADDR) { zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); @@ -1777,7 +1782,7 @@ static int f2fs_write_end(struct file *file, * let generic_perform_write() try to copy data again through copied=0. */ if (!PageUptodate(page)) { - if (unlikely(copied != PAGE_SIZE)) + if (unlikely(copied != len)) copied = 0; else SetPageUptodate(page); From 7c033b7c61ac6ece83306a3ef30c2619b3a60512 Mon Sep 17 00:00:00 2001 From: Geliang Tang Date: Tue, 20 Dec 2016 21:57:42 +0800 Subject: [PATCH 482/546] f2fs: use rb_entry_safe commit ed0b56209fe79a1309653d4b03f5c3147f580f6b upstream. Use rb_entry_safe() instead of open-coding it. Signed-off-by: Geliang Tang Signed-off-by: Jaegeuk Kim --- fs/f2fs/extent_cache.c | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index e02c3d88dc9a..6ed6424807b6 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -311,28 +311,24 @@ static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, tmp_node = parent; if (parent && fofs > en->ei.fofs) tmp_node = rb_next(parent); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); tmp_node = parent; if (parent && fofs < en->ei.fofs) tmp_node = rb_prev(parent); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); return NULL; lookup_neighbors: if (fofs == en->ei.fofs) { /* lookup prev node for merging backward later */ tmp_node = rb_prev(&en->rb_node); - *prev_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } if (fofs == en->ei.fofs + en->ei.len - 1) { /* lookup next node for merging frontward later */ tmp_node = rb_next(&en->rb_node); - *next_ex = tmp_node ? - rb_entry(tmp_node, struct extent_node, rb_node) : NULL; + *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); } return en; } @@ -493,9 +489,8 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (!next_en) { struct rb_node *node = rb_next(&en->rb_node); - next_en = node ? - rb_entry(node, struct extent_node, rb_node) - : NULL; + next_en = rb_entry_safe(node, struct extent_node, + rb_node); } if (parts) From b0bc7fc867ee1e8ef9bcee30207084bb6ee8e598 Mon Sep 17 00:00:00 2001 From: Yunlei He Date: Thu, 22 Dec 2016 11:46:24 +0800 Subject: [PATCH 483/546] f2fs: fix a missing discard prefree segments commit 650d3c4e56e1e92ee6e004648c9deb243e5963e0 upstream. If userspace issue a fstrim with a range not involve prefree segments, it will reuse these segments without discard. This patch fix it. Signed-off-by: Yunlei He Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 13bea6e5120e..f4e41f997ae3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -996,9 +996,13 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) dirty_i->nr_dirty[PRE] -= end - start; - if (force || !test_opt(sbi, DISCARD)) + if (!test_opt(sbi, DISCARD)) continue; + if (force && start >= cpc->trim_start && + (end - 1) <= cpc->trim_end) + continue; + if (!test_opt(sbi, LFS) || sbi->segs_per_sec == 1) { f2fs_issue_discard(sbi, START_BLOCK(sbi, start), (end - start) << sbi->log_blocks_per_seg); @@ -2343,8 +2347,12 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, sit_i->dirty_sentries); out: if (cpc->reason == CP_DISCARD) { + __u64 trim_start = cpc->trim_start; + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) add_discard_addrs(sbi, cpc); + + cpc->trim_start = trim_start; } mutex_unlock(&sit_i->sentry_lock); From 919fe37467530bb916d576edd9813470006e5426 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 11:51:32 -0800 Subject: [PATCH 484/546] f2fs: reassign new segment for mode=lfs commit 9d52a504db6db9e4e254576130aa867838daff55 upstream. Otherwise we can remain wrong curseg->next_blkoff, resulting in fsck failure. Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index f4e41f997ae3..e6d3f3d4b028 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1508,9 +1508,6 @@ void allocate_new_segments(struct f2fs_sb_info *sbi) unsigned int old_segno; int i; - if (test_opt(sbi, LFS)) - return; - for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { curseg = CURSEG_I(sbi, i); old_segno = curseg->segno; From 1c2238bfa9623baf5d3e4b1d653ac181416c173b Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 12:13:03 -0800 Subject: [PATCH 485/546] f2fs: add submit_bio tracepoint commit 554b5125f5cfca6653461fd52bad24d4ef35ec29 upstream. This patch adds final submit_bio() tracepoint. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 14 +++++++----- include/trace/events/f2fs.h | 45 ++++++++++++++++++++++++------------- 2 files changed, 39 insertions(+), 20 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index d7c20880e53e..ccb6fb142d56 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -176,6 +176,10 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); } + if (is_read_io(bio_op(bio))) + trace_f2fs_submit_read_bio(sbi->sb, type, bio); + else + trace_f2fs_submit_write_bio(sbi->sb, type, bio); submit_bio(0, bio); } @@ -186,13 +190,13 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) if (!io->bio) return; - if (is_read_io(fio->op)) - trace_f2fs_submit_read_bio(io->sbi->sb, fio, io->bio); - else - trace_f2fs_submit_write_bio(io->sbi->sb, fio, io->bio); - bio_set_op_attrs(io->bio, fio->op, fio->op_flags); + if (is_read_io(fio->op)) + trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio); + else + trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio); + __submit_bio(io->sbi, io->bio, fio->type); io->bio = NULL; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 7ad46e8a89e6..217691582dd4 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -779,12 +779,11 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, TP_CONDITION(page->mapping) ); -DECLARE_EVENT_CLASS(f2fs__submit_bio, +DECLARE_EVENT_CLASS(f2fs__bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_STRUCT__entry( __field(dev_t, dev) @@ -797,9 +796,9 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, TP_fast_assign( __entry->dev = sb->s_dev; - __entry->op = fio->op; - __entry->op_flags = fio->op_flags; - __entry->type = fio->type; + __entry->op = bio_op(bio); + __entry->op_flags = bio->bi_rw; + __entry->type = type; __entry->sector = bio->bi_iter.bi_sector; __entry->size = bio->bi_iter.bi_size; ), @@ -812,22 +811,38 @@ DECLARE_EVENT_CLASS(f2fs__submit_bio, __entry->size) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_write_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_write_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); -DEFINE_EVENT_CONDITION(f2fs__submit_bio, f2fs_submit_read_bio, +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_prepare_read_bio, - TP_PROTO(struct super_block *sb, struct f2fs_io_info *fio, - struct bio *bio), + TP_PROTO(struct super_block *sb, int type, struct bio *bio), - TP_ARGS(sb, fio, bio), + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_read_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), + + TP_CONDITION(bio) +); + +DEFINE_EVENT_CONDITION(f2fs__bio, f2fs_submit_write_bio, + + TP_PROTO(struct super_block *sb, int type, struct bio *bio), + + TP_ARGS(sb, type, bio), TP_CONDITION(bio) ); From caa5a4a163216769414cfe3dd3d57c3bb6f86784 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 14 Dec 2016 10:12:56 -0800 Subject: [PATCH 486/546] f2fs: support IO alignment for DATA and NODE writes commit 0a595ebaaa6b53a2226d3fee2a2fd616ea5ba378 upstream. This patch implements IO alignment by filling dummy blocks in DATA and NODE write bios. If we can guarantee, for example, 32KB or 64KB for such the IOs, we can eliminate underlying dummy page problem which FTL conducts in order to close MLC or TLC partial written pages. Note that, - it requires "-o mode=lfs". - IO size should be power of 2, not exceed BIO_MAX_PAGES, 256. - read IO is still 4KB. - do checkpoint at fsync, if dummy NODE page was written. Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 55 +++++++++++++++++++++++++++++++++++++++-- fs/f2fs/f2fs.h | 4 ++- fs/f2fs/segment.c | 9 +++++-- fs/f2fs/segment.h | 3 +++ fs/f2fs/super.c | 13 +++++++++- include/linux/f2fs_fs.h | 6 +++++ 6 files changed, 84 insertions(+), 6 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ccb6fb142d56..6c8465d26bb1 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -94,6 +94,17 @@ static void f2fs_write_end_io(struct bio *bio) struct page *page = bvec->bv_page; enum count_type type = WB_DATA_TYPE(page); + if (IS_DUMMY_WRITTEN_PAGE(page)) { + set_page_private(page, (unsigned long)NULL); + ClearPagePrivate(page); + unlock_page(page); + mempool_free(page, sbi->write_io_dummy); + + if (unlikely(bio->bi_error)) + f2fs_stop_checkpoint(sbi, true); + continue; + } + fscrypt_pullback_bio_page(&page, true); if (unlikely(bio->bi_error)) { @@ -172,10 +183,42 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, struct bio *bio, enum page_type type) { if (!is_read_io(bio_op(bio))) { + unsigned int start; + if (f2fs_sb_mounted_blkzoned(sbi->sb) && current->plug && (type == DATA || type == NODE)) blk_finish_plug(current->plug); + + if (type != DATA && type != NODE) + goto submit_io; + + start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS; + start %= F2FS_IO_SIZE(sbi); + + if (start == 0) + goto submit_io; + + /* fill dummy pages */ + for (; start < F2FS_IO_SIZE(sbi); start++) { + struct page *page = + mempool_alloc(sbi->write_io_dummy, + GFP_NOIO | __GFP_ZERO | __GFP_NOFAIL); + f2fs_bug_on(sbi, !page); + + SetPagePrivate(page); + set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE); + lock_page(page); + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) + f2fs_bug_on(sbi, 1); + } + /* + * In the NODE case, we lose next block address chain. So, we + * need to do checkpoint in f2fs_sync_file. + */ + if (type == NODE) + set_sbi_flag(sbi, SBI_NEED_CP); } +submit_io: if (is_read_io(bio_op(bio))) trace_f2fs_submit_read_bio(sbi->sb, type, bio); else @@ -320,13 +363,14 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) return 0; } -void f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_mbio(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); struct f2fs_bio_info *io; bool is_read = is_read_io(fio->op); struct page *bio_page; + int err = 0; io = is_read ? &sbi->read_io : &sbi->write_io[btype]; @@ -347,6 +391,12 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { + if ((fio->type == DATA || fio->type == NODE) && + fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) { + err = -EAGAIN; + dec_page_count(sbi, WB_DATA_TYPE(bio_page)); + goto out_fail; + } io->bio = __bio_alloc(sbi, fio->new_blkaddr, BIO_MAX_PAGES, is_read); io->fio = *fio; @@ -360,9 +410,10 @@ void f2fs_submit_page_mbio(struct f2fs_io_info *fio) io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); - +out_fail: up_write(&io->io_rwsem); trace_f2fs_submit_page_mbio(fio->page, fio); + return err; } static void __set_data_blkaddr(struct dnode_of_data *dn) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 883d3ab388c1..f9a739ffca0f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -859,6 +859,8 @@ struct f2fs_sb_info { struct f2fs_bio_info read_io; /* for read bios */ struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + int write_io_size_bits; /* Write IO size bits */ + mempool_t *write_io_dummy; /* Dummy pages */ /* for checkpoint */ struct f2fs_checkpoint *ckpt; /* raw checkpoint pointer */ @@ -2241,7 +2243,7 @@ void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, struct page *, nid_t, enum page_type, int); void f2fs_flush_merged_bios(struct f2fs_sb_info *); int f2fs_submit_page_bio(struct f2fs_io_info *); -void f2fs_submit_page_mbio(struct f2fs_io_info *); +int f2fs_submit_page_mbio(struct f2fs_io_info *); struct block_device *f2fs_target_device(struct f2fs_sb_info *, block_t, struct bio *); int f2fs_target_device_index(struct f2fs_sb_info *, block_t); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index e6d3f3d4b028..a7bb97826445 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1684,15 +1684,20 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { int type = __get_segment_type(fio->page, fio->type); + int err; if (fio->type == NODE || fio->type == DATA) mutex_lock(&fio->sbi->wio_mutex[fio->type]); - +reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type); /* writeout dirty page into bdev */ - f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_mbio(fio); + if (err == -EAGAIN) { + fio->old_blkaddr = fio->new_blkaddr; + goto reallocate; + } if (fio->type == NODE || fio->type == DATA) mutex_unlock(&fio->sbi->wio_mutex[fio->type]); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9d44ce83acb2..08f1455c812c 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -186,9 +186,12 @@ struct segment_allocation { * the page is atomically written, and it is in inmem_pages list. */ #define ATOMIC_WRITTEN_PAGE ((unsigned long)-1) +#define DUMMY_WRITTEN_PAGE ((unsigned long)-2) #define IS_ATOMIC_WRITTEN_PAGE(page) \ (page_private(page) == (unsigned long)ATOMIC_WRITTEN_PAGE) +#define IS_DUMMY_WRITTEN_PAGE(page) \ + (page_private(page) == (unsigned long)DUMMY_WRITTEN_PAGE) struct inmem_pages { struct list_head list; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index e6d8d011786c..fb9f6c09fa11 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -1764,6 +1764,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) FDEV(i).total_segments, FDEV(i).start_blk, FDEV(i).end_blk); } + f2fs_msg(sbi->sb, KERN_INFO, + "IO Block Size: %8d KB", F2FS_IO_SIZE_KB(sbi)); return 0; } @@ -1881,12 +1883,19 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (err) goto free_options; + if (F2FS_IO_SIZE(sbi) > 1) { + sbi->write_io_dummy = + mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); + if (!sbi->write_io_dummy) + goto free_options; + } + /* get an inode for meta space */ sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi)); if (IS_ERR(sbi->meta_inode)) { f2fs_msg(sb, KERN_ERR, "Failed to read F2FS meta data inode"); err = PTR_ERR(sbi->meta_inode); - goto free_options; + goto free_io_dummy; } err = get_valid_checkpoint(sbi); @@ -2104,6 +2113,8 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_meta_inode: make_bad_inode(sbi->meta_inode); iput(sbi->meta_inode); +free_io_dummy: + mempool_destroy(sbi->write_io_dummy); free_options: destroy_percpu_info(sbi); kfree(options); diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index 3e5972ef5019..cf54a312993f 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -36,6 +36,12 @@ #define F2FS_NODE_INO(sbi) (sbi->node_ino_num) #define F2FS_META_INO(sbi) (sbi->meta_ino_num) +#define F2FS_IO_SIZE(sbi) (1 << (sbi)->write_io_size_bits) /* Blocks */ +#define F2FS_IO_SIZE_KB(sbi) (1 << ((sbi)->write_io_size_bits + 2)) /* KB */ +#define F2FS_IO_SIZE_BYTES(sbi) (1 << ((sbi)->write_io_size_bits + 12)) /* B */ +#define F2FS_IO_SIZE_BITS(sbi) ((sbi)->write_io_size_bits) /* power of 2 */ +#define F2FS_IO_SIZE_MASK(sbi) (F2FS_IO_SIZE(sbi) - 1) + /* This flag is used by node and meta inodes, and by recovery */ #define GFP_F2FS_ZERO (GFP_NOFS | __GFP_ZERO) #define GFP_F2FS_HIGH_ZERO (GFP_NOFS | __GFP_ZERO | __GFP_HIGHMEM) From d595df246866177c15222dab18b63aea2d68a037 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 21 Dec 2016 17:09:19 -0800 Subject: [PATCH 487/546] f2fs: get io size bit from mount option commit ec91538dccd44329ad83d3aae1aa6a8389b5c75f upstream. This patch adds to set io_size_bits from mount option. Signed-off-by: Jaegeuk Kim --- Documentation/filesystems/f2fs.txt | 2 ++ fs/f2fs/super.c | 22 ++++++++++++++++++++++ 2 files changed, 24 insertions(+) diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index 753dd4f96afe..d99faced79cb 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -157,6 +157,8 @@ data_flush Enable data flushing before checkpoint in order to mode=%s Control block allocation mode which supports "adaptive" and "lfs". In "lfs" mode, there should be no random writes towards main area. +io_bits=%u Set the bit size of write IO requests. It should be set + with "mode=lfs". ================================================================================ DEBUGFS ENTRIES diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index fb9f6c09fa11..3b169927408e 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -101,6 +101,7 @@ enum { Opt_noinline_data, Opt_data_flush, Opt_mode, + Opt_io_size_bits, Opt_fault_injection, Opt_lazytime, Opt_nolazytime, @@ -133,6 +134,7 @@ static match_table_t f2fs_tokens = { {Opt_noinline_data, "noinline_data"}, {Opt_data_flush, "data_flush"}, {Opt_mode, "mode=%s"}, + {Opt_io_size_bits, "io_bits=%u"}, {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, @@ -535,6 +537,17 @@ static int parse_options(struct super_block *sb, char *options) } kfree(name); break; + case Opt_io_size_bits: + if (args->from && match_int(args, &arg)) + return -EINVAL; + if (arg > __ilog2_u32(BIO_MAX_PAGES)) { + f2fs_msg(sb, KERN_WARNING, + "Not support %d, larger than %d", + 1 << arg, BIO_MAX_PAGES); + return -EINVAL; + } + sbi->write_io_size_bits = arg; + break; case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; @@ -558,6 +571,13 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } + + if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { + f2fs_msg(sb, KERN_ERR, + "Should set mode=lfs with %uKB-sized IO", + F2FS_IO_SIZE_KB(sbi)); + return -EINVAL; + } return 0; } @@ -918,6 +938,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (test_opt(sbi, LFS)) seq_puts(seq, "lfs"); seq_printf(seq, ",active_logs=%u", sbi->active_logs); + if (F2FS_IO_SIZE_BITS(sbi)) + seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); return 0; } From 2c4e5c2eb054bc4a42f4491188486b8ed8a75dc5 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 13:55:09 -0800 Subject: [PATCH 488/546] f2fs: show the max number of atomic operations commit 26a28a0c1eb756ba18bfb1f93309c4b4406b9cd9 upstream. This patch adds to show the max number of atomic operations which are conducting concurrently. Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 7 +++++++ fs/f2fs/f2fs.h | 17 +++++++++++++++++ fs/f2fs/file.c | 8 ++++++-- fs/f2fs/segment.c | 1 + 4 files changed, 31 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index fbd5184140d0..29cdf0c1da1d 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -50,6 +50,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); + si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; @@ -256,6 +258,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); + seq_printf(s, " - Atomic write count: %4d (Max. %4d)\n", + si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -414,6 +418,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inline_dir, 0); atomic_set(&sbi->inplace_count, 0); + atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->max_aw_cnt, 0); + mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); mutex_unlock(&f2fs_stat_mutex); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f9a739ffca0f..19e054b1c4f8 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -951,6 +951,8 @@ struct f2fs_sb_info { atomic_t inline_xattr; /* # of inline_xattr inodes */ atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ + atomic_t aw_cnt; /* # of atomic writes */ + atomic_t max_aw_cnt; /* max # of atomic writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif @@ -2303,6 +2305,7 @@ struct f2fs_stat_info { int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; int inline_xattr, inline_inode, inline_dir, orphans; + int aw_cnt, max_aw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2374,6 +2377,17 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) ((sbi)->block_count[(curseg)->alloc_type]++) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) +#define stat_inc_atomic_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_dec_atomic_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); +#define stat_update_max_atomic_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_aw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ @@ -2427,6 +2441,9 @@ void f2fs_destroy_root_stats(void); #define stat_dec_inline_inode(inode) #define stat_inc_inline_dir(inode) #define stat_dec_inline_dir(inode) +#define stat_inc_atomic_write(inode) +#define stat_dec_atomic_write(inode) +#define stat_update_max_atomic_write(inode) #define stat_inc_seg_type(sbi, curseg) #define stat_inc_block_count(sbi, curseg) #define stat_inc_inplace_blocks(sbi) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 5808d5c709a7..d7eacef08797 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1546,6 +1546,8 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (ret) clear_inode_flag(inode, FI_ATOMIC_FILE); out: + stat_inc_atomic_write(inode); + stat_update_max_atomic_write(inode); inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1575,9 +1577,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) set_inode_flag(inode, FI_ATOMIC_FILE); goto err_out; } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + stat_dec_atomic_write(inode); + } else { + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } - - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); err_out: inode_unlock(inode); mnt_drop_write_file(filp); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index a7bb97826445..353ec85b3835 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -243,6 +243,7 @@ void drop_inmem_pages(struct inode *inode) struct f2fs_inode_info *fi = F2FS_I(inode); clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); From e447d60a08fcd7dfdad3dfc17a8a1b1a3fa83f98 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 28 Dec 2016 17:31:15 -0800 Subject: [PATCH 489/546] f2fs: don't allow encrypted operations without keys commit 363fa4e078cbdc97a172c19d19dc04b41b52ebc8 upstream. This patch fixes the renaming bug on encrypted filenames, which was pointed by (ext4: don't allow encrypted operations without keys) Cc: Theodore Ts'o Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/namei.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index ca9e2f85eae8..db3079cd665d 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -660,6 +660,12 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, bool is_old_inline = f2fs_has_inline_dentry(old_dir); int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((old_dir != new_dir) && f2fs_encrypted_inode(new_dir) && !fscrypt_has_permitted_context(new_dir, old_inode)) { err = -EPERM; @@ -840,6 +846,12 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, int old_nlink = 0, new_nlink = 0; int err = -ENOENT; + if ((f2fs_encrypted_inode(old_dir) && + !fscrypt_has_encryption_key(old_dir)) || + (f2fs_encrypted_inode(new_dir) && + !fscrypt_has_encryption_key(new_dir))) + return -ENOKEY; + if ((f2fs_encrypted_inode(old_dir) || f2fs_encrypted_inode(new_dir)) && (old_dir != new_dir) && (!fscrypt_has_permitted_context(new_dir, old_inode) || From 3e60c843efadf25e2e9a695ac47019f3144abfea Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Tue, 3 Jan 2017 17:19:30 -0800 Subject: [PATCH 490/546] f2fs: drop exist_data for inline_data when truncated to 0 commit bb95d9ab2a9d4afd03b59a603cccb2c601f68b78 upstream. A test program gets the SEEK_DATA with two values between a new created file and the exist file on f2fs filesystem. F2FS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 8192) SEEK_DATA size != 0 (offset = 4096) PNFS filesystem, (the first "test1" is a new file) SEEK_DATA size != 0 (offset = 4096) SEEK_DATA size != 0 (offset = 4096) int main(int argc, char **argv) { char *filename = argv[1]; int offset = 1, i = 0, fd = -1; if (argc < 2) { printf("Usage: %s f2fsfilename\n", argv[0]); return -1; } /* if (!access(filename, F_OK) || errno != ENOENT) { printf("Needs a new file for test, %m\n"); return -1; }*/ fd = open(filename, O_RDWR | O_CREAT, 0777); if (fd < 0) { printf("Create test file %s failed, %m\n", filename); return -1; } for (i = 0; i < 20; i++) { offset = 1 << i; ftruncate(fd, 0); lseek(fd, offset, SEEK_SET); write(fd, "test", 5); /* Get the alloc size by seek data equal zero*/ if (lseek(fd, 0, SEEK_DATA)) { printf("SEEK_DATA size != 0 (offset = %d)\n", offset); break; } } close(fd); return 0; } Reported-and-Tested-by: Kinglong Mee Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index d7eacef08797..9da13847cda4 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -571,6 +571,8 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) if (f2fs_has_inline_data(inode)) { if (truncate_inline_inode(ipage, from)) set_page_dirty(ipage); + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); f2fs_put_page(ipage, 1); truncate_page = true; goto out; From 551836d68b7c2d2dfa83d4a6b1171e0261b8153c Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 14:07:53 -0800 Subject: [PATCH 491/546] f2fs: relax async discard commands more commit 4e6a8d9b224f886362ea6e8f6046b541437c944f upstream. This patch relaxes async discard commands to avoid waiting its end_io during checkpoint. Instead of waiting them during checkpoint, it will be done when actually reusing them. Test on initial partition of nvme drive. # time fstrim /mnt/test Before : 6.158s After : 4.822s Reviewed-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 7 ++----- fs/f2fs/f2fs.h | 4 +++- fs/f2fs/segment.c | 24 +++++++++++++++++++----- fs/f2fs/super.c | 3 +++ 4 files changed, 27 insertions(+), 11 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index d485bea3d6bb..2ed785e5ffbb 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1255,7 +1255,6 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, prefree_segments(sbi)); flush_sit_entries(sbi, cpc); clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); unblock_operations(sbi); goto out; } @@ -1274,12 +1273,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* unlock all the fs_lock[] in do_checkpoint() */ err = do_checkpoint(sbi, cpc); - if (err) { + if (err) release_discard_addrs(sbi); - } else { + else clear_prefree_segments(sbi, cpc); - f2fs_wait_all_discard_bio(sbi); - } unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 19e054b1c4f8..3409392dde9c 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -250,6 +250,8 @@ struct discard_entry { struct bio_entry { struct list_head list; + block_t lstart; + block_t len; struct bio *bio; struct completion event; int error; @@ -2178,7 +2180,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); void invalidate_blocks(struct f2fs_sb_info *, block_t); bool is_checkpointed_data(struct f2fs_sb_info *, block_t); void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *); +void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 353ec85b3835..fa3d4f8db389 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -625,20 +625,23 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) } static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, - struct bio *bio) + struct bio *bio, block_t lstart, block_t len) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); INIT_LIST_HEAD(&be->list); be->bio = bio; + be->lstart = lstart; + be->len = len; init_completion(&be->event); list_add_tail(&be->list, wait_list); return be; } -void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { struct list_head *wait_list = &(SM_I(sbi)->wait_list); struct bio_entry *be, *tmp; @@ -647,7 +650,15 @@ void f2fs_wait_all_discard_bio(struct f2fs_sb_info *sbi) struct bio *bio = be->bio; int err; - wait_for_completion_io(&be->event); + if (!completion_done(&be->event)) { + if ((be->lstart <= blkaddr && + blkaddr < be->lstart + be->len) || + blkaddr == NULL_ADDR) + wait_for_completion_io(&be->event); + else + continue; + } + err = be->error; if (err == -EOPNOTSUPP) err = 0; @@ -756,6 +767,7 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { struct bio *bio = NULL; + block_t lblkstart = blkstart; int err; trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); @@ -770,13 +782,13 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio); + struct bio_entry *be = __add_bio_entry(sbi, bio, + lblkstart, blklen); bio->bi_private = be; bio->bi_end_io = f2fs_submit_bio_wait_endio; submit_bio(REQ_SYNC, bio); } - return err; } @@ -1655,6 +1667,8 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, *new_blkaddr = NEXT_FREE_BLKADDR(sbi, curseg); + f2fs_wait_discard_bio(sbi, *new_blkaddr); + /* * __add_sum_entry should be resided under the curseg_mutex * because, this function updates a summary entry in the diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 3b169927408e..84d5686c4aa4 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -770,6 +770,9 @@ static void f2fs_put_super(struct super_block *sb) write_checkpoint(sbi, &cpc); } + /* be sure to wait for any on-going discard commands */ + f2fs_wait_discard_bio(sbi, NULL_ADDR); + /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); From bab2a03c46d508f3585e4128dcb6ab51d8422fbe Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 16:58:54 -0800 Subject: [PATCH 492/546] f2fs: avoid needless checkpoint in f2fs_trim_fs commit 0333ad4e4f49e14217256e1db1134a70cf60b2de upstream. The f2fs_trim_fs() doesn't need to do checkpoint if there are newly allocated data blocks only which didn't change the critical checkpoint data such as nat and sit entries. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 2ed785e5ffbb..886b96c12c31 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1249,14 +1249,15 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_flush_merged_bios(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD && !is_sbi_flag_set(sbi, SBI_IS_DIRTY)) { - f2fs_bug_on(sbi, NM_I(sbi)->dirty_nat_cnt); - f2fs_bug_on(sbi, SIT_I(sbi)->dirty_sentries); - f2fs_bug_on(sbi, prefree_segments(sbi)); - flush_sit_entries(sbi, cpc); - clear_prefree_segments(sbi, cpc); - unblock_operations(sbi); - goto out; + if (cpc->reason == CP_DISCARD) { + if (NM_I(sbi)->dirty_nat_cnt == 0 && + SIT_I(sbi)->dirty_sentries == 0 && + prefree_segments(sbi) == 0) { + flush_sit_entries(sbi, cpc); + clear_prefree_segments(sbi, cpc); + unblock_operations(sbi); + goto out; + } } /* From 48743604b60c09d8d8ad7deba71e9ef7f121f437 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Thu, 29 Dec 2016 22:06:15 -0800 Subject: [PATCH 493/546] f2fs: return fs_trim if there is no candidate commit 25290fa5591d81767713db304e0d567bf991786f upstream. If there is no candidate to submit discard command during f2sf_trim_fs, let's return without checkpoint. Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 5 +++++ fs/f2fs/f2fs.h | 1 + fs/f2fs/segment.c | 28 +++++++++++++++++++++++----- 3 files changed, 29 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 886b96c12c31..fbf04d4d7964 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -1250,6 +1250,11 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* this is the case of multiple fstrims without any changes */ if (cpc->reason == CP_DISCARD) { + if (!exist_trim_candidates(sbi, cpc)) { + unblock_operations(sbi); + goto out; + } + if (NM_I(sbi)->dirty_nat_cnt == 0 && SIT_I(sbi)->dirty_sentries == 0 && prefree_segments(sbi) == 0) { diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3409392dde9c..3eb53e3a8eae 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2186,6 +2186,7 @@ void release_discard_addrs(struct f2fs_sb_info *); int npages_for_summary_flush(struct f2fs_sb_info *, bool); void allocate_new_segments(struct f2fs_sb_info *); int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); +bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); void update_meta_page(struct f2fs_sb_info *, void *, block_t); void write_meta_page(struct f2fs_sb_info *, struct page *); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index fa3d4f8db389..12f8d5ab7ccf 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -914,7 +914,8 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, SM_I(sbi)->nr_discards += end - start; } -static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) +static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, + bool check_only) { int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); int max_blocks = sbi->blocks_per_seg; @@ -928,12 +929,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) - return; + return false; if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) - return; + return false; } /* SIT_VBLOCK_MAP_SIZE should be multiple of sizeof(unsigned long) */ @@ -951,8 +952,12 @@ static void add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc) && (end - start) < cpc->trim_minlen) continue; + if (check_only) + return true; + __add_discard_entry(sbi, cpc, se, start, end); } + return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) @@ -1533,6 +1538,19 @@ static const struct segment_allocation default_salloc_ops = { .allocate_segment = allocate_segment_by_default, }; +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) +{ + __u64 trim_start = cpc->trim_start; + + mutex_lock(&SIT_I(sbi)->sentry_lock); + for (; trim_start <= cpc->trim_end; trim_start++) + if (add_discard_addrs(sbi, cpc, true)) + break; + mutex_unlock(&SIT_I(sbi)->sentry_lock); + + return trim_start <= cpc->trim_end; +} + int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) { __u64 start = F2FS_BYTES_TO_BLK(range->start); @@ -2329,7 +2347,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* add discard candidates */ if (cpc->reason != CP_DISCARD) { cpc->trim_start = segno; - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); } if (to_journal) { @@ -2367,7 +2385,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) - add_discard_addrs(sbi, cpc); + add_discard_addrs(sbi, cpc, false); cpc->trim_start = trim_start; } From f6873a1643386ca10212709bc307d8acfae68811 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:49:42 +0800 Subject: [PATCH 494/546] f2fs: clean up with list_{first, last}_entry commit 939afa943c5290a3b92f01612a792af17bc98115 upstream. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/checkpoint.c | 4 ++-- fs/f2fs/data.c | 4 ++-- fs/f2fs/node.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index fbf04d4d7964..45ef3b6bfb04 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -892,7 +892,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) F2FS_DIRTY_DENTS : F2FS_DIRTY_DATA)); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, dirty_list); + fi = list_first_entry(head, struct f2fs_inode_info, dirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { @@ -925,7 +925,7 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) spin_unlock(&sbi->inode_lock[DIRTY_META]); return 0; } - fi = list_entry(head->next, struct f2fs_inode_info, + fi = list_first_entry(head, struct f2fs_inode_info, gdirty_list); inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[DIRTY_META]); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 6c8465d26bb1..f1001c871cb6 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1144,7 +1144,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, prefetchw(&page->flags); if (pages) { - page = list_entry(pages->prev, struct page, lru); + page = list_last_entry(pages, struct page, lru); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1262,7 +1262,7 @@ static int f2fs_read_data_pages(struct file *file, struct list_head *pages, unsigned nr_pages) { struct inode *inode = file->f_mapping->host; - struct page *page = list_entry(pages->prev, struct page, lru); + struct page *page = list_last_entry(pages, struct page, lru); trace_f2fs_readpages(inode, page, nr_pages); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index e7997e240366..9278b21ee073 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -174,7 +174,7 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) spin_unlock(&nm_i->nid_list_lock); return; } - fnid = list_entry(nm_i->nid_list[FREE_NID_LIST].next, + fnid = list_first_entry(&nm_i->nid_list[FREE_NID_LIST], struct free_nid, list); *nid = fnid->nid; spin_unlock(&nm_i->nid_list_lock); From d255c3328a0fdbce1c94286192b9ef1645a4d7cb Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:50:26 +0800 Subject: [PATCH 495/546] f2fs: introduce FI_ATOMIC_COMMIT commit 5fe457430e554a2f5188f13c1a2e36ad845640c5 upstream. This patch introduces a new flag to indicate inode status of doing atomic write committing, so that, we can keep atomic write status for inode during atomic committing, then we can skip GCing pages of atomic write inode, that avoids random GCed datas being mixed with current transaction, so isolation of transaction can be kept. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/data.c | 2 +- fs/f2fs/f2fs.h | 6 ++++++ fs/f2fs/file.c | 11 ++++++----- fs/f2fs/gc.c | 6 ++++++ fs/f2fs/segment.c | 10 +++++++--- 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index f1001c871cb6..4ac72a3f920a 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -2017,7 +2017,7 @@ static int f2fs_set_data_page_dirty(struct page *page) if (!PageUptodate(page)) SetPageUptodate(page); - if (f2fs_is_atomic_file(inode)) { + if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) { if (!IS_ATOMIC_WRITTEN_PAGE(page)) { register_inmem_page(inode, page); return 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 3eb53e3a8eae..807855d37c63 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1706,6 +1706,7 @@ enum { FI_UPDATE_WRITE, /* inode has in-place-update data */ FI_NEED_IPU, /* used for ipu per file */ FI_ATOMIC_FILE, /* indicate atomic file */ + FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */ FI_VOLATILE_FILE, /* indicate volatile file */ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */ FI_DROP_CACHE, /* drop dirty page cache */ @@ -1895,6 +1896,11 @@ static inline bool f2fs_is_atomic_file(struct inode *inode) return is_inode_flag_set(inode, FI_ATOMIC_FILE); } +static inline bool f2fs_is_commit_atomic_write(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_ATOMIC_COMMIT); +} + static inline bool f2fs_is_volatile_file(struct inode *inode) { return is_inode_flag_set(inode, FI_VOLATILE_FILE); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 9da13847cda4..e4e5d76d80b0 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1573,14 +1573,15 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) goto err_out; if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); ret = commit_inmem_pages(inode); - if (ret) { - set_inode_flag(inode, FI_ATOMIC_FILE); + if (ret) goto err_out; - } + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); - stat_dec_atomic_write(inode); + if (!ret) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); + } } else { ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index d3a36e4b442c..7f0c3e02408c 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -569,6 +569,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + set_new_dnode(&dn, inode, NULL, NULL, 0); err = get_dnode_of_data(&dn, bidx, LOOKUP_NODE); if (err) @@ -661,6 +664,9 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, if (!check_valid_map(F2FS_I_SB(inode), segno, off)) goto out; + if (f2fs_is_atomic_file(inode)) + goto out; + if (gc_type == BG_GC) { if (PageWriteback(page)) goto out; diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 12f8d5ab7ccf..6a870677d58a 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -242,12 +242,12 @@ void drop_inmem_pages(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); - clear_inode_flag(inode, FI_ATOMIC_FILE); - stat_dec_atomic_write(inode); - mutex_lock(&fi->inmem_lock); __revoke_inmem_pages(inode, &fi->inmem_pages, true, false); mutex_unlock(&fi->inmem_lock); + + clear_inode_flag(inode, FI_ATOMIC_FILE); + stat_dec_atomic_write(inode); } static int __commit_inmem_pages(struct inode *inode, @@ -316,6 +316,8 @@ int commit_inmem_pages(struct inode *inode) f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); + set_inode_flag(inode, FI_ATOMIC_COMMIT); + mutex_lock(&fi->inmem_lock); err = __commit_inmem_pages(inode, &revoke_list); if (err) { @@ -337,6 +339,8 @@ int commit_inmem_pages(struct inode *inode) } mutex_unlock(&fi->inmem_lock); + clear_inode_flag(inode, FI_ATOMIC_COMMIT); + f2fs_unlock_op(sbi); return err; } From 28324dea8f25850cb668af32354d1daaf13acb2c Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:51:01 +0800 Subject: [PATCH 496/546] f2fs: check in-memory block bitmap commit 355e78913c0d57492076d545b6f44b94fec2bf6b upstream. This patch adds a mirror for valid block bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 32 ++++++++++++++++++++++++++++++-- fs/f2fs/segment.h | 6 ++++++ 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 6a870677d58a..aae1c2ea7a1d 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1101,14 +1101,32 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) + if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) + if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { +#ifdef CONFIG_F2FS_CHECK_FS + if (!f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir)) + f2fs_bug_on(sbi, 1); + else + WARN_ON(1); +#else f2fs_bug_on(sbi, 1); +#endif + } if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -2432,6 +2450,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) !sit_i->sentries[start].ckpt_valid_map) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sentries[start].cur_valid_map_mir + = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); + if (!sit_i->sentries[start].cur_valid_map_mir) + return -ENOMEM; +#endif + if (f2fs_discard_en(sbi)) { sit_i->sentries[start].discard_map = kzalloc(SIT_VBLOCK_MAP_SIZE, GFP_KERNEL); @@ -2861,6 +2886,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) if (sit_i->sentries) { for (start = 0; start < MAIN_SEGS(sbi); start++) { kfree(sit_i->sentries[start].cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sentries[start].cur_valid_map_mir); +#endif kfree(sit_i->sentries[start].ckpt_valid_map); kfree(sit_i->sentries[start].discard_map); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 08f1455c812c..9af95194db06 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -164,6 +164,9 @@ struct seg_entry { unsigned int ckpt_valid_blocks:10; /* # of valid blocks last cp */ unsigned int padding:6; /* padding */ unsigned char *cur_valid_map; /* validity bitmap of blocks */ +#ifdef CONFIG_F2FS_CHECK_FS + unsigned char *cur_valid_map_mir; /* mirror of current valid bitmap */ +#endif /* * # of valid blocks and the validity bitmap stored in the the last * checkpoint pack. This information is used by the SSR mode. @@ -320,6 +323,9 @@ static inline void seg_info_from_raw_sit(struct seg_entry *se, se->ckpt_valid_blocks = GET_SIT_VBLOCKS(rs); memcpy(se->cur_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); memcpy(se->ckpt_valid_map, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#ifdef CONFIG_F2FS_CHECK_FS + memcpy(se->cur_valid_map_mir, rs->valid_map, SIT_VBLOCK_MAP_SIZE); +#endif se->type = GET_SIT_TYPE(rs); se->mtime = le64_to_cpu(rs->mtime); } From 2bb15049c4d1dcc75bc5951a9f7122c9d8212ff6 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:01 +0800 Subject: [PATCH 497/546] f2fs: check in-memory nat version bitmap commit 599a09b2c1ac222e6aad0c22515d1ccde7c3b702 upstream. This patch adds a mirror for nat version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/f2fs.h | 3 +++ fs/f2fs/node.c | 11 +++++++++++ fs/f2fs/node.h | 15 +++++++++++++++ 3 files changed, 29 insertions(+) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 807855d37c63..d4783d9cf4e0 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -607,6 +607,9 @@ struct f2fs_nm_info { /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *nat_bitmap_mir; /* NAT bitmap mirror */ +#endif int bitmap_size; /* bitmap size */ }; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index b01b01cfc39e..bc67dc323f7e 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -2366,6 +2366,14 @@ static int init_node_manager(struct f2fs_sb_info *sbi) GFP_KERNEL); if (!nm_i->nat_bitmap) return -ENOMEM; + +#ifdef CONFIG_F2FS_CHECK_FS + nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, + GFP_KERNEL); + if (!nm_i->nat_bitmap_mir) + return -ENOMEM; +#endif + return 0; } @@ -2440,6 +2448,9 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) up_write(&nm_i->nat_tree_lock); kfree(nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(nm_i->nat_bitmap_mir); +#endif sbi->nm_info = NULL; kfree(nm_i); } diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 9278b21ee073..29ff783eb9c3 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -186,6 +186,12 @@ static inline void next_free_nid(struct f2fs_sb_info *sbi, nid_t *nid) static inline void get_nat_bitmap(struct f2fs_sb_info *sbi, void *addr) { struct f2fs_nm_info *nm_i = NM_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(nm_i->nat_bitmap, nm_i->nat_bitmap_mir, + nm_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(addr, nm_i->nat_bitmap, nm_i->bitmap_size); } @@ -203,6 +209,12 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) (seg_off << sbi->log_blocks_per_seg << 1) + (block_off & (sbi->blocks_per_seg - 1))); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != + f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -228,6 +240,9 @@ static inline void set_to_next_nat(struct f2fs_nm_info *nm_i, nid_t start_nid) unsigned int block_off = NAT_BLOCK_OFFSET(start_nid); f2fs_change_bit(block_off, nm_i->nat_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, nm_i->nat_bitmap_mir); +#endif } static inline nid_t ino_of_node(struct page *node_page) From 1a50ad0f416742b441cb41a9f13d73aec7e2825a Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Sat, 7 Jan 2017 18:52:34 +0800 Subject: [PATCH 498/546] f2fs: check in-memory sit version bitmap commit ae27d62e6befd3cac4ffa702e644cc52019642e8 upstream. This patch adds a mirror for sit version bitmap, and use it to detect in-memory bitmap corruption which may be caused by bit-transition of cache or memory overflow. Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim --- fs/f2fs/segment.c | 16 ++++++++++++---- fs/f2fs/segment.h | 18 ++++++++++++++++++ 2 files changed, 30 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index aae1c2ea7a1d..c39bbffb0cac 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2421,7 +2421,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct sit_info *sit_i; unsigned int sit_segs, start; - char *src_bitmap, *dst_bitmap; + char *src_bitmap; unsigned int bitmap_size; /* allocate memory for SIT information */ @@ -2483,17 +2483,22 @@ static int build_sit_info(struct f2fs_sb_info *sbi) bitmap_size = __bitmap_size(sbi, SIT_BITMAP); src_bitmap = __bitmap_ptr(sbi, SIT_BITMAP); - dst_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); - if (!dst_bitmap) + sit_i->sit_bitmap = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap) return -ENOMEM; +#ifdef CONFIG_F2FS_CHECK_FS + sit_i->sit_bitmap_mir = kmemdup(src_bitmap, bitmap_size, GFP_KERNEL); + if (!sit_i->sit_bitmap_mir) + return -ENOMEM; +#endif + /* init SIT information */ sit_i->s_ops = &default_salloc_ops; sit_i->sit_base_addr = le32_to_cpu(raw_super->sit_blkaddr); sit_i->sit_blocks = sit_segs << sbi->log_blocks_per_seg; sit_i->written_valid_blocks = 0; - sit_i->sit_bitmap = dst_bitmap; sit_i->bitmap_size = bitmap_size; sit_i->dirty_sentries = 0; sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK; @@ -2901,6 +2906,9 @@ static void destroy_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = NULL; kfree(sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + kfree(sit_i->sit_bitmap_mir); +#endif kfree(sit_i); } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 9af95194db06..5cb5755c75d9 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -209,6 +209,9 @@ struct sit_info { block_t sit_blocks; /* # of blocks used by SIT area */ block_t written_valid_blocks; /* # of valid blocks in main area */ char *sit_bitmap; /* SIT bitmap pointer */ +#ifdef CONFIG_F2FS_CHECK_FS + char *sit_bitmap_mir; /* SIT bitmap mirror */ +#endif unsigned int bitmap_size; /* SIT bitmap size */ unsigned long *tmp_map; /* bitmap for temporal use */ @@ -423,6 +426,12 @@ static inline void get_sit_bitmap(struct f2fs_sb_info *sbi, void *dst_addr) { struct sit_info *sit_i = SIT_I(sbi); + +#ifdef CONFIG_F2FS_CHECK_FS + if (memcmp(sit_i->sit_bitmap, sit_i->sit_bitmap_mir, + sit_i->bitmap_size)) + f2fs_bug_on(sbi, 1); +#endif memcpy(dst_addr, sit_i->sit_bitmap, sit_i->bitmap_size); } @@ -643,6 +652,12 @@ static inline pgoff_t current_sit_addr(struct f2fs_sb_info *sbi, check_seg_range(sbi, start); +#ifdef CONFIG_F2FS_CHECK_FS + if (f2fs_test_bit(offset, sit_i->sit_bitmap) != + f2fs_test_bit(offset, sit_i->sit_bitmap_mir)) + f2fs_bug_on(sbi, 1); +#endif + /* calculate sit block address */ if (f2fs_test_bit(offset, sit_i->sit_bitmap)) blk_addr += sit_i->sit_blocks; @@ -668,6 +683,9 @@ static inline void set_to_next_sit(struct sit_info *sit_i, unsigned int start) unsigned int block_off = SIT_BLOCK_OFFSET(start); f2fs_change_bit(block_off, sit_i->sit_bitmap); +#ifdef CONFIG_F2FS_CHECK_FS + f2fs_change_bit(block_off, sit_i->sit_bitmap_mir); +#endif } static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi) From 5004036a93890a963ad23bf793e050f5c0312dac Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 14:13:03 -0800 Subject: [PATCH 499/546] f2fs: clean up flush/discard command namings commit b01a92019cac30398ef75b560d2668b399f4e393 upstream. This patch simply cleans up the names for flush/discard commands. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 2 +- fs/f2fs/f2fs.h | 20 +++++----- fs/f2fs/segment.c | 98 +++++++++++++++++++++++------------------------ 3 files changed, 59 insertions(+), 61 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 29cdf0c1da1d..883f1ea9e0b6 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -194,7 +194,7 @@ static void update_mem_info(struct f2fs_sb_info *sbi) si->cache_mem += sizeof(struct f2fs_gc_kthread); /* build merge flush thread */ - if (SM_I(sbi)->cmd_control_info) + if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); /* free nids */ diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index d4783d9cf4e0..167c5f841b5f 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -248,13 +248,12 @@ struct discard_entry { int len; /* # of consecutive blocks of the discard */ }; -struct bio_entry { - struct list_head list; - block_t lstart; - block_t len; - struct bio *bio; - struct completion event; - int error; +struct discard_cmd { + struct list_head list; /* command list */ + struct completion wait; /* compleation */ + block_t lstart; /* logical start address */ + block_t len; /* length */ + struct bio *bio; /* bio */ }; /* for the list of fsync inodes, used only during recovery */ @@ -701,8 +700,8 @@ struct f2fs_sm_info { unsigned int rec_prefree_segments; /* for small discard management */ - struct list_head discard_list; /* 4KB discard list */ - struct list_head wait_list; /* linked with issued discard bio */ + struct list_head discard_entry_list; /* 4KB discard entry list */ + struct list_head discard_cmd_list; /* discard cmd list */ int nr_discards; /* # of discards in the list */ int max_discards; /* max. discards to be issued */ @@ -716,8 +715,7 @@ struct f2fs_sm_info { unsigned int min_fsync_blocks; /* threshold for fsync */ /* for flush command control */ - struct flush_cmd_control *cmd_control_info; - + struct flush_cmd_control *fcc_info; }; /* diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c39bbffb0cac..289b3facd2d8 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -26,7 +26,7 @@ #define __reverse_ffz(x) __reverse_ffs(~(x)) static struct kmem_cache *discard_entry_slab; -static struct kmem_cache *bio_entry_slab; +static struct kmem_cache *discard_cmd_slab; static struct kmem_cache *sit_entry_set_slab; static struct kmem_cache *inmem_entry_slab; @@ -439,7 +439,7 @@ static int submit_flush_wait(struct f2fs_sb_info *sbi) static int issue_flush_thread(void *data) { struct f2fs_sb_info *sbi = data; - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; wait_queue_head_t *q = &fcc->flush_wait_queue; repeat: if (kthread_should_stop()) @@ -468,7 +468,7 @@ static int issue_flush_thread(void *data) int f2fs_issue_flush(struct f2fs_sb_info *sbi) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), @@ -511,8 +511,8 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) struct flush_cmd_control *fcc; int err = 0; - if (SM_I(sbi)->cmd_control_info) { - fcc = SM_I(sbi)->cmd_control_info; + if (SM_I(sbi)->fcc_info) { + fcc = SM_I(sbi)->fcc_info; goto init_thread; } @@ -522,14 +522,14 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) atomic_set(&fcc->submit_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); - SM_I(sbi)->cmd_control_info = fcc; + SM_I(sbi)->fcc_info = fcc; init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(fcc->f2fs_issue_flush)) { err = PTR_ERR(fcc->f2fs_issue_flush); kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; return err; } @@ -538,7 +538,7 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) { - struct flush_cmd_control *fcc = SM_I(sbi)->cmd_control_info; + struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; if (fcc && fcc->f2fs_issue_flush) { struct task_struct *flush_thread = fcc->f2fs_issue_flush; @@ -548,7 +548,7 @@ void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free) } if (free) { kfree(fcc); - SM_I(sbi)->cmd_control_info = NULL; + SM_I(sbi)->fcc_info = NULL; } } @@ -628,42 +628,43 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct bio_entry *__add_bio_entry(struct f2fs_sb_info *sbi, +static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, struct bio *bio, block_t lstart, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be = f2fs_kmem_cache_alloc(bio_entry_slab, GFP_NOFS); + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc; - INIT_LIST_HEAD(&be->list); - be->bio = bio; - be->lstart = lstart; - be->len = len; - init_completion(&be->event); - list_add_tail(&be->list, wait_list); + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); + INIT_LIST_HEAD(&dc->list); + dc->bio = bio; + dc->lstart = lstart; + dc->len = len; + init_completion(&dc->wait); + list_add_tail(&dc->list, wait_list); - return be; + return dc; } /* This should be covered by global mutex, &sit_i->sentry_lock */ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct list_head *wait_list = &(SM_I(sbi)->wait_list); - struct bio_entry *be, *tmp; + struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd *dc, *tmp; - list_for_each_entry_safe(be, tmp, wait_list, list) { - struct bio *bio = be->bio; + list_for_each_entry_safe(dc, tmp, wait_list, list) { + struct bio *bio = dc->bio; int err; - if (!completion_done(&be->event)) { - if ((be->lstart <= blkaddr && - blkaddr < be->lstart + be->len) || + if (!completion_done(&dc->wait)) { + if ((dc->lstart <= blkaddr && + blkaddr < dc->lstart + dc->len) || blkaddr == NULL_ADDR) - wait_for_completion_io(&be->event); + wait_for_completion_io(&dc->wait); else continue; } - err = be->error; + err = bio->bi_error; if (err == -EOPNOTSUPP) err = 0; @@ -672,17 +673,16 @@ void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) "Issue discard failed, ret: %d", err); bio_put(bio); - list_del(&be->list); - kmem_cache_free(bio_entry_slab, be); + list_del(&dc->list); + kmem_cache_free(discard_cmd_slab, dc); } } -static void f2fs_submit_bio_wait_endio(struct bio *bio) +static void f2fs_submit_discard_endio(struct bio *bio) { - struct bio_entry *be = (struct bio_entry *)bio->bi_private; + struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - be->error = bio->bi_error; - complete(&be->event); + complete(&dc->wait); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -786,11 +786,11 @@ static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, SECTOR_FROM_BLOCK(blklen), GFP_NOFS, 0, &bio); if (!err && bio) { - struct bio_entry *be = __add_bio_entry(sbi, bio, + struct discard_cmd *dc = __add_discard_cmd(sbi, bio, lblkstart, blklen); - bio->bi_private = be; - bio->bi_end_io = f2fs_submit_bio_wait_endio; + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; submit_bio(REQ_SYNC, bio); } return err; @@ -897,7 +897,7 @@ static void __add_discard_entry(struct f2fs_sb_info *sbi, struct cp_control *cpc, struct seg_entry *se, unsigned int start, unsigned int end) { - struct list_head *head = &SM_I(sbi)->discard_list; + struct list_head *head = &SM_I(sbi)->discard_entry_list; struct discard_entry *new, *last; if (!list_empty(head)) { @@ -966,7 +966,7 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,7 +992,7 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_list); + struct list_head *head = &(SM_I(sbi)->discard_entry_list); struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct blk_plug plug; @@ -2783,8 +2783,8 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - INIT_LIST_HEAD(&sm_info->discard_list); - INIT_LIST_HEAD(&sm_info->wait_list); + INIT_LIST_HEAD(&sm_info->discard_entry_list); + INIT_LIST_HEAD(&sm_info->discard_cmd_list); sm_info->nr_discards = 0; sm_info->max_discards = 0; @@ -2934,15 +2934,15 @@ int __init create_segment_manager_caches(void) if (!discard_entry_slab) goto fail; - bio_entry_slab = f2fs_kmem_cache_create("bio_entry", - sizeof(struct bio_entry)); - if (!bio_entry_slab) + discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd", + sizeof(struct discard_cmd)); + if (!discard_cmd_slab) goto destroy_discard_entry; sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set", sizeof(struct sit_entry_set)); if (!sit_entry_set_slab) - goto destroy_bio_entry; + goto destroy_discard_cmd; inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry", sizeof(struct inmem_pages)); @@ -2952,8 +2952,8 @@ int __init create_segment_manager_caches(void) destroy_sit_entry_set: kmem_cache_destroy(sit_entry_set_slab); -destroy_bio_entry: - kmem_cache_destroy(bio_entry_slab); +destroy_discard_cmd: + kmem_cache_destroy(discard_cmd_slab); destroy_discard_entry: kmem_cache_destroy(discard_entry_slab); fail: @@ -2963,7 +2963,7 @@ int __init create_segment_manager_caches(void) void destroy_segment_manager_caches(void) { kmem_cache_destroy(sit_entry_set_slab); - kmem_cache_destroy(bio_entry_slab); + kmem_cache_destroy(discard_cmd_slab); kmem_cache_destroy(discard_entry_slab); kmem_cache_destroy(inmem_entry_slab); } From 22d281fbca13aefadb676a0ad56c27b49e6044b8 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 11 Jan 2017 10:21:15 -0800 Subject: [PATCH 500/546] f2fs: reorganize stat information commit d4adb30f25f5f2aa9b205891e395251d2a9098be upstream. This patch modifies stat information more clearly. Signed-off-by: Jaegeuk Kim --- fs/f2fs/debug.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 883f1ea9e0b6..cd338ca24941 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -258,8 +258,6 @@ static int stat_show(struct seq_file *s, void *v) si->inline_dir); seq_printf(s, " - Orphan Inode: %u\n", si->orphans); - seq_printf(s, " - Atomic write count: %4d (Max. %4d)\n", - si->aw_cnt, si->max_aw_cnt); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -318,8 +316,10 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - inmem: %4d, wb_cp_data: %4d, wb_data: %4d\n", - si->inmem_pages, si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", + si->nr_wb_cp_data, si->nr_wb_data); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", From fac6ab8b31ace72e73bc0c104ea24f3f48e3b415 Mon Sep 17 00:00:00 2001 From: Steve Muckle Date: Wed, 27 Sep 2017 17:18:48 -0700 Subject: [PATCH 501/546] ANDROID: add script to fetch android kernel config fragments The Android kernel config fragments now live in a separate repository. To prevent others from having to search for this location, add a script to fetch and unpack the fragments. Update .gitignore to include these fragments. Change-Id: If2d4a59b86e4573b0a9b3190025dfe4191870b46 Signed-off-by: Steve Muckle --- .gitignore | 3 +++ android/configs/android-fetch-configs.sh | 4 ++++ 2 files changed, 7 insertions(+) create mode 100755 android/configs/android-fetch-configs.sh diff --git a/.gitignore b/.gitignore index fd3a35592543..fa3e5f1d0808 100644 --- a/.gitignore +++ b/.gitignore @@ -112,3 +112,6 @@ all.config # Kdevelop4 *.kdev4 + +# fetched Android config fragments +android/configs/android-*.cfg diff --git a/android/configs/android-fetch-configs.sh b/android/configs/android-fetch-configs.sh new file mode 100755 index 000000000000..9915c1356ed3 --- /dev/null +++ b/android/configs/android-fetch-configs.sh @@ -0,0 +1,4 @@ +#!/bin/sh + +curl https://android.googlesource.com/kernel/configs/+archive/master/android-4.4.tar.gz | tar xzv + From 406cbca78c6bbe482def63ba4489d50f098758e8 Mon Sep 17 00:00:00 2001 From: Juri Lelli Date: Wed, 3 May 2017 14:30:48 +0100 Subject: [PATCH 502/546] UPSTREAM: cpufreq: schedutil: use now as reference when aggregating shared policy requests Currently, sugov_next_freq_shared() uses last_freq_update_time as a reference to decide when to start considering CPU contributions as stale. However, since last_freq_update_time is set by the last CPU that issued a frequency transition, this might cause problems in certain cases. In practice, the detection of stale utilization values fails whenever the CPU with such values was the last to update the policy. For example (and please note again that the SCHED_CPUFREQ_RT flag is not the problem here, but only the detection of after how much time that flag has to be considered stale), suppose a policy with 2 CPUs: CPU0 | CPU1 | | RT task scheduled | SCHED_CPUFREQ_RT is set | CPU1->last_update = now | freq transition to max | last_freq_update_time = now | more than TICK_NSEC nsecs | a small CFS wakes up | CPU0->last_update = now1 | delta_ns(CPU0) < TICK_NSEC* | CPU0's util is considered | delta_ns(CPU1) = | last_freq_update_time - | CPU1->last_update = 0 | < TICK_NSEC | CPU1 is still considered | CPU1->SCHED_CPUFREQ_RT is set | we stay at max (until CPU1 | exits from idle) | * delta_ns is actually negative as now1 > last_freq_update_time While last_freq_update_time is a sensible reference for rate limiting, it doesn't seem to be useful for working around stale CPU states. Fix the problem by always considering now (time) as the reference for deciding when CPUs have stale contributions. Signed-off-by: Juri Lelli Acked-by: Vincent Guittot Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki (cherry picked from commit d86ab9cff8b936aadde444d0e263a8db5ff0349b) --- kernel/sched/cpufreq_schedutil.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index b90f7434e13b..28977799017b 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -322,11 +322,10 @@ static void sugov_update_single(struct update_util_data *hook, u64 time, sugov_update_commit(sg_policy, time, next_f); } -static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) +static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu, u64 time) { struct sugov_policy *sg_policy = sg_cpu->sg_policy; struct cpufreq_policy *policy = sg_policy->policy; - u64 last_freq_update_time = sg_policy->last_freq_update_time; unsigned long util = 0, max = 1; unsigned int j; @@ -342,7 +341,7 @@ static unsigned int sugov_next_freq_shared(struct sugov_cpu *sg_cpu) * enough, don't take the CPU into account as it probably is * idle now (and clear iowait_boost for it). */ - delta_ns = last_freq_update_time - j_sg_cpu->last_update; + delta_ns = time - j_sg_cpu->last_update; if (delta_ns > TICK_NSEC) { j_sg_cpu->iowait_boost = 0; j_sg_cpu->iowait_boost_pending = false; @@ -387,7 +386,7 @@ static void sugov_update_shared(struct update_util_data *hook, u64 time, if (flags & SCHED_CPUFREQ_DL) next_f = sg_policy->policy->cpuinfo.max_freq; else - next_f = sugov_next_freq_shared(sg_cpu); + next_f = sugov_next_freq_shared(sg_cpu, time); sugov_update_commit(sg_policy, time, next_f); } From b0fd6697e434ffdb304bea04e03f829fe06b5a29 Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Mon, 9 Jan 2017 18:16:29 -0800 Subject: [PATCH 503/546] f2fs: catch up to v4.14-rc1 This is cherry-picked from upstrea-f2fs-stable-linux-4.4.y. Changes include: commit c7fd9e2b4a6876 ("f2fs: hurry up to issue discard after io interruption") commit 603dde39653d6d ("f2fs: fix to show correct discard_granularity in sysfs") ... commit 565f0225f95f15 ("f2fs: factor out discard command info into discard_cmd_control") commit c4cc29d19eaf01 ("f2fs: remove batched discard in f2fs_trim_fs") Change-Id: Icd8a85ac0c19a8aa25cd2591a12b4e9b85bdf1c5 Signed-off-by: Jaegeuk Kim --- Documentation/ABI/testing/sysfs-fs-f2fs | 53 + Documentation/filesystems/f2fs.txt | 22 + fs/crypto/Makefile | 1 + fs/crypto/bio.c | 143 +++ fs/crypto/crypto.c | 278 ++--- fs/crypto/fname.c | 111 +- fs/crypto/fscrypt_private.h | 107 ++ fs/crypto/keyinfo.c | 270 +++-- fs/crypto/policy.c | 243 ++-- fs/f2fs/Makefile | 2 +- fs/f2fs/acl.c | 9 +- fs/f2fs/checkpoint.c | 217 +++- fs/f2fs/crypto_key.c | 240 ---- fs/f2fs/crypto_policy.c | 248 ----- fs/f2fs/data.c | 638 +++++++---- fs/f2fs/debug.c | 67 +- fs/f2fs/dir.c | 106 +- fs/f2fs/extent_cache.c | 368 +++--- fs/f2fs/f2fs.h | 1356 +++++++++++++++-------- fs/f2fs/f2fs_crypto.h | 150 --- fs/f2fs/file.c | 473 +++++--- fs/f2fs/gc.c | 265 +++-- fs/f2fs/gc.h | 27 +- fs/f2fs/hash.c | 7 +- fs/f2fs/inline.c | 174 +-- fs/f2fs/inode.c | 168 ++- fs/f2fs/namei.c | 182 ++- fs/f2fs/node.c | 723 ++++++++---- fs/f2fs/node.h | 65 +- fs/f2fs/recovery.c | 102 +- fs/f2fs/segment.c | 1292 ++++++++++++++++----- fs/f2fs/segment.h | 197 ++-- fs/f2fs/super.c | 1255 ++++++++++++++------- fs/f2fs/sysfs.c | 556 ++++++++++ fs/f2fs/trace.c | 4 +- fs/f2fs/xattr.c | 178 ++- fs/f2fs/xattr.h | 11 +- include/linux/f2fs_fs.h | 43 +- include/linux/fscrypt_common.h | 138 +++ include/linux/fscrypt_notsupp.h | 177 +++ include/linux/fscrypt_supp.h | 145 +++ include/linux/fscrypto.h | 12 +- include/trace/events/f2fs.h | 213 +++- include/uapi/linux/fs.h | 30 + mm/util.c | 1 + 45 files changed, 7588 insertions(+), 3479 deletions(-) create mode 100644 fs/crypto/bio.c create mode 100644 fs/crypto/fscrypt_private.h delete mode 100644 fs/f2fs/crypto_key.c delete mode 100644 fs/f2fs/crypto_policy.c delete mode 100644 fs/f2fs/f2fs_crypto.h create mode 100644 fs/f2fs/sysfs.c create mode 100644 include/linux/fscrypt_common.h create mode 100644 include/linux/fscrypt_notsupp.h create mode 100644 include/linux/fscrypt_supp.h diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs index 0345f2d1c727..500c60403653 100644 --- a/Documentation/ABI/testing/sysfs-fs-f2fs +++ b/Documentation/ABI/testing/sysfs-fs-f2fs @@ -57,6 +57,15 @@ Contact: "Jaegeuk Kim" Description: Controls the issue rate of small discard commands. +What: /sys/fs/f2fs//discard_granularity +Date: July 2017 +Contact: "Chao Yu" +Description: + Controls discard granularity of inner discard thread, inner thread + will not issue discards with size that is smaller than granularity. + The unit size is one block, now only support configuring in range + of [1, 512]. + What: /sys/fs/f2fs//max_victim_search Date: January 2014 Contact: "Jaegeuk Kim" @@ -92,3 +101,47 @@ Date: October 2015 Contact: "Chao Yu" Description: Controls the count of nid pages to be readaheaded. + +What: /sys/fs/f2fs//dirty_nats_ratio +Date: January 2016 +Contact: "Chao Yu" +Description: + Controls dirty nat entries ratio threshold, if current + ratio exceeds configured threshold, checkpoint will + be triggered for flushing dirty nat entries. + +What: /sys/fs/f2fs//lifetime_write_kbytes +Date: January 2016 +Contact: "Shuoran Liu" +Description: + Shows total written kbytes issued to disk. + +What: /sys/fs/f2fs//inject_rate +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection rate. + +What: /sys/fs/f2fs//inject_type +Date: May 2016 +Contact: "Sheng Yong" +Description: + Controls the injection type. + +What: /sys/fs/f2fs//reserved_blocks +Date: June 2017 +Contact: "Chao Yu" +Description: + Controls current reserved blocks in system. + +What: /sys/fs/f2fs//gc_urgent +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Do background GC agressively + +What: /sys/fs/f2fs//gc_urgent_sleep_time +Date: August 2017 +Contact: "Jaegeuk Kim" +Description: + Controls sleep time of GC urgent mode diff --git a/Documentation/filesystems/f2fs.txt b/Documentation/filesystems/f2fs.txt index d99faced79cb..6cf9ad12c57f 100644 --- a/Documentation/filesystems/f2fs.txt +++ b/Documentation/filesystems/f2fs.txt @@ -125,6 +125,7 @@ active_logs=%u Support configuring the number of active logs. In the disable_ext_identify Disable the extension list configured by mkfs, so f2fs does not aware of cold files such as media files. inline_xattr Enable the inline xattrs feature. +noinline_xattr Disable the inline xattrs feature. inline_data Enable the inline data feature: New created small(<~3.4k) files can be written into inode block. inline_dentry Enable the inline dir feature: data in new created @@ -159,6 +160,18 @@ mode=%s Control block allocation mode which supports "adaptive" writes towards main area. io_bits=%u Set the bit size of write IO requests. It should be set with "mode=lfs". +usrquota Enable plain user disk quota accounting. +grpquota Enable plain group disk quota accounting. +prjquota Enable plain project quota accounting. +usrjquota= Appoint specified file and type during mount, so that quota +grpjquota= information can be properly updated during recovery flow, +prjjquota= : must be in root directory; +jqfmt= : [vfsold,vfsv0,vfsv1]. +offusrjquota Turn off user journelled quota. +offgrpjquota Turn off group journelled quota. +offprjjquota Turn off project journelled quota. +quota Enable plain user disk quota accounting. +noquota Disable all plain disk quota option. ================================================================================ DEBUGFS ENTRIES @@ -204,6 +217,15 @@ Files in /sys/fs/f2fs/ gc_idle = 1 will select the Cost Benefit approach & setting gc_idle = 2 will select the greedy approach. + gc_urgent This parameter controls triggering background GCs + urgently or not. Setting gc_urgent = 0 [default] + makes back to default behavior, while if it is set + to 1, background thread starts to do GC by given + gc_urgent_sleep_time interval. + + gc_urgent_sleep_time This parameter controls sleep time for gc_urgent. + 500 ms is set by default. See above gc_urgent. + reclaim_segments This parameter controls the number of prefree segments to be reclaimed. If the number of prefree segments is larger than the number of segments diff --git a/fs/crypto/Makefile b/fs/crypto/Makefile index f17684c48739..9f6607f17b53 100644 --- a/fs/crypto/Makefile +++ b/fs/crypto/Makefile @@ -1,3 +1,4 @@ obj-$(CONFIG_FS_ENCRYPTION) += fscrypto.o fscrypto-y := crypto.o fname.o policy.o keyinfo.o +fscrypto-$(CONFIG_BLOCK) += bio.o diff --git a/fs/crypto/bio.c b/fs/crypto/bio.c new file mode 100644 index 000000000000..a91ed46fe503 --- /dev/null +++ b/fs/crypto/bio.c @@ -0,0 +1,143 @@ +/* + * This contains encryption functions for per-file encryption. + * + * Copyright (C) 2015, Google, Inc. + * Copyright (C) 2015, Motorola Mobility + * + * Written by Michael Halcrow, 2014. + * + * Filename encryption additions + * Uday Savagaonkar, 2014 + * Encryption policy handling additions + * Ildar Muslukhov, 2014 + * Add fscrypt_pullback_bio_page() + * Jaegeuk Kim, 2015. + * + * This has not yet undergone a rigorous security audit. + * + * The usage of AES-XTS should conform to recommendations in NIST + * Special Publication 800-38E and IEEE P1619/D16. + */ + +#include +#include +#include +#include +#include "fscrypt_private.h" + +/* + * Call fscrypt_decrypt_page on every single page, reusing the encryption + * context. + */ +static void completion_pages(struct work_struct *work) +{ + struct fscrypt_ctx *ctx = + container_of(work, struct fscrypt_ctx, r.work); + struct bio *bio = ctx->r.bio; + struct bio_vec *bv; + int i; + + bio_for_each_segment_all(bv, bio, i) { + struct page *page = bv->bv_page; + int ret = fscrypt_decrypt_page(page->mapping->host, page, + PAGE_SIZE, 0, page->index); + + if (ret) { + WARN_ON_ONCE(1); + SetPageError(page); + } else { + SetPageUptodate(page); + } + unlock_page(page); + } + fscrypt_release_ctx(ctx); + bio_put(bio); +} + +void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) +{ + INIT_WORK(&ctx->r.work, completion_pages); + ctx->r.bio = bio; + queue_work(fscrypt_read_workqueue, &ctx->r.work); +} +EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); + +void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + struct fscrypt_ctx *ctx; + struct page *bounce_page; + + /* The bounce data pages are unmapped. */ + if ((*page)->mapping) + return; + + /* The bounce data page is unmapped. */ + bounce_page = *page; + ctx = (struct fscrypt_ctx *)page_private(bounce_page); + + /* restore control page */ + *page = ctx->w.control_page; + + if (restore) + fscrypt_restore_control_page(bounce_page); +} +EXPORT_SYMBOL(fscrypt_pullback_bio_page); + +int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + struct fscrypt_ctx *ctx; + struct page *ciphertext_page = NULL; + struct bio *bio; + int ret, err = 0; + + BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); + + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return PTR_ERR(ctx); + + ciphertext_page = fscrypt_alloc_bounce_page(ctx, GFP_NOWAIT); + if (IS_ERR(ciphertext_page)) { + err = PTR_ERR(ciphertext_page); + goto errout; + } + + while (len--) { + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk, + ZERO_PAGE(0), ciphertext_page, + PAGE_SIZE, 0, GFP_NOFS); + if (err) + goto errout; + + bio = bio_alloc(GFP_NOWAIT, 1); + if (!bio) { + err = -ENOMEM; + goto errout; + } + bio->bi_bdev = inode->i_sb->s_bdev; + bio->bi_iter.bi_sector = + pblk << (inode->i_sb->s_blocksize_bits - 9); + bio_set_op_attrs(bio, REQ_OP_WRITE, 0); + ret = bio_add_page(bio, ciphertext_page, + inode->i_sb->s_blocksize, 0); + if (ret != inode->i_sb->s_blocksize) { + /* should never happen! */ + WARN_ON(1); + bio_put(bio); + err = -EIO; + goto errout; + } + err = submit_bio_wait(0, bio); + bio_put(bio); + if (err) + goto errout; + lblk++; + pblk++; + } + err = 0; +errout: + fscrypt_release_ctx(ctx); + return err; +} +EXPORT_SYMBOL(fscrypt_zeroout_range); diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 2d40ab9edc9f..c7835df7e7b8 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -24,10 +24,10 @@ #include #include #include -#include #include #include -#include +#include +#include "fscrypt_private.h" static unsigned int num_prealloc_crypto_pages = 32; static unsigned int num_prealloc_crypto_ctxs = 128; @@ -44,7 +44,7 @@ static mempool_t *fscrypt_bounce_page_pool = NULL; static LIST_HEAD(fscrypt_free_ctxs); static DEFINE_SPINLOCK(fscrypt_ctx_lock); -static struct workqueue_struct *fscrypt_read_workqueue; +struct workqueue_struct *fscrypt_read_workqueue; static DEFINE_MUTEX(fscrypt_init_mutex); static struct kmem_cache *fscrypt_ctx_cachep; @@ -63,7 +63,7 @@ void fscrypt_release_ctx(struct fscrypt_ctx *ctx) { unsigned long flags; - if (ctx->flags & FS_WRITE_PATH_FL && ctx->w.bounce_page) { + if (ctx->flags & FS_CTX_HAS_BOUNCE_BUFFER_FL && ctx->w.bounce_page) { mempool_free(ctx->w.bounce_page, fscrypt_bounce_page_pool); ctx->w.bounce_page = NULL; } @@ -88,7 +88,7 @@ EXPORT_SYMBOL(fscrypt_release_ctx); * Return: An allocated and initialized encryption context on success; error * value or NULL otherwise. */ -struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) +struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, gfp_t gfp_flags) { struct fscrypt_ctx *ctx = NULL; struct fscrypt_info *ci = inode->i_crypt_info; @@ -121,7 +121,7 @@ struct fscrypt_ctx *fscrypt_get_ctx(struct inode *inode, gfp_t gfp_flags) } else { ctx->flags &= ~FS_CTX_REQUIRES_FREE_ENCRYPT_FL; } - ctx->flags &= ~FS_WRITE_PATH_FL; + ctx->flags &= ~FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx; } EXPORT_SYMBOL(fscrypt_get_ctx); @@ -141,20 +141,15 @@ static void page_crypt_complete(struct crypto_async_request *req, int res) complete(&ecr->completion); } -typedef enum { - FS_DECRYPT = 0, - FS_ENCRYPT, -} fscrypt_direction_t; - -static int do_page_crypto(struct inode *inode, - fscrypt_direction_t rw, pgoff_t index, - struct page *src_page, struct page *dest_page, - gfp_t gfp_flags) +int fscrypt_do_page_crypto(const struct inode *inode, fscrypt_direction_t rw, + u64 lblk_num, struct page *src_page, + struct page *dest_page, unsigned int len, + unsigned int offs, gfp_t gfp_flags) { struct { __le64 index; - u8 padding[FS_XTS_TWEAK_SIZE - sizeof(__le64)]; - } xts_tweak; + u8 padding[FS_IV_SIZE - sizeof(__le64)]; + } iv; struct skcipher_request *req = NULL; DECLARE_FS_COMPLETION_RESULT(ecr); struct scatterlist dst, src; @@ -162,6 +157,18 @@ static int do_page_crypto(struct inode *inode, struct crypto_skcipher *tfm = ci->ci_ctfm; int res = 0; + BUG_ON(len == 0); + + BUILD_BUG_ON(sizeof(iv) != FS_IV_SIZE); + BUILD_BUG_ON(AES_BLOCK_SIZE != FS_IV_SIZE); + iv.index = cpu_to_le64(lblk_num); + memset(iv.padding, 0, sizeof(iv.padding)); + + if (ci->ci_essiv_tfm != NULL) { + crypto_cipher_encrypt_one(ci->ci_essiv_tfm, (u8 *)&iv, + (u8 *)&iv); + } + req = skcipher_request_alloc(tfm, gfp_flags); if (!req) { printk_ratelimited(KERN_ERR @@ -174,15 +181,11 @@ static int do_page_crypto(struct inode *inode, req, CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, page_crypt_complete, &ecr); - BUILD_BUG_ON(sizeof(xts_tweak) != FS_XTS_TWEAK_SIZE); - xts_tweak.index = cpu_to_le64(index); - memset(xts_tweak.padding, 0, sizeof(xts_tweak.padding)); - sg_init_table(&dst, 1); - sg_set_page(&dst, dest_page, PAGE_SIZE, 0); + sg_set_page(&dst, dest_page, len, offs); sg_init_table(&src, 1); - sg_set_page(&src, src_page, PAGE_SIZE, 0); - skcipher_request_set_crypt(req, &src, &dst, PAGE_SIZE, &xts_tweak); + sg_set_page(&src, src_page, len, offs); + skcipher_request_set_crypt(req, &src, &dst, len, &iv); if (rw == FS_DECRYPT) res = crypto_skcipher_decrypt(req); else @@ -202,53 +205,86 @@ static int do_page_crypto(struct inode *inode, return 0; } -static struct page *alloc_bounce_page(struct fscrypt_ctx *ctx, gfp_t gfp_flags) +struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags) { ctx->w.bounce_page = mempool_alloc(fscrypt_bounce_page_pool, gfp_flags); if (ctx->w.bounce_page == NULL) return ERR_PTR(-ENOMEM); - ctx->flags |= FS_WRITE_PATH_FL; + ctx->flags |= FS_CTX_HAS_BOUNCE_BUFFER_FL; return ctx->w.bounce_page; } /** * fscypt_encrypt_page() - Encrypts a page - * @inode: The inode for which the encryption should take place - * @plaintext_page: The page to encrypt. Must be locked. - * @gfp_flags: The gfp flag for memory allocation + * @inode: The inode for which the encryption should take place + * @page: The page to encrypt. Must be locked for bounce-page + * encryption. + * @len: Length of data to encrypt in @page and encrypted + * data in returned page. + * @offs: Offset of data within @page and returned + * page holding encrypted data. + * @lblk_num: Logical block number. This must be unique for multiple + * calls with same inode, except when overwriting + * previously written data. + * @gfp_flags: The gfp flag for memory allocation * - * Allocates a ciphertext page and encrypts plaintext_page into it using the ctx - * encryption context. + * Encrypts @page using the ctx encryption context. Performs encryption + * either in-place or into a newly allocated bounce page. + * Called on the page write path. * - * Called on the page write path. The caller must call + * Bounce page allocation is the default. + * In this case, the contents of @page are encrypted and stored in an + * allocated bounce page. @page has to be locked and the caller must call * fscrypt_restore_control_page() on the returned ciphertext page to * release the bounce buffer and the encryption context. * - * Return: An allocated page with the encrypted content on success. Else, an + * In-place encryption is used by setting the FS_CFLG_OWN_PAGES flag in + * fscrypt_operations. Here, the input-page is returned with its content + * encrypted. + * + * Return: A page with the encrypted content on success. Else, an * error value or NULL. */ -struct page *fscrypt_encrypt_page(struct inode *inode, - struct page *plaintext_page, gfp_t gfp_flags) +struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) + { struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; + struct page *ciphertext_page = page; int err; - BUG_ON(!PageLocked(plaintext_page)); + BUG_ON(len % FS_CRYPTO_BLOCK_SIZE != 0); + + if (inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES) { + /* with inplace-encryption we just encrypt the page */ + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, page, + ciphertext_page, len, offs, + gfp_flags); + if (err) + return ERR_PTR(err); + + return ciphertext_page; + } + + BUG_ON(!PageLocked(page)); ctx = fscrypt_get_ctx(inode, gfp_flags); if (IS_ERR(ctx)) return (struct page *)ctx; /* The encryption operation will require a bounce page. */ - ciphertext_page = alloc_bounce_page(ctx, gfp_flags); + ciphertext_page = fscrypt_alloc_bounce_page(ctx, gfp_flags); if (IS_ERR(ciphertext_page)) goto errout; - ctx->w.control_page = plaintext_page; - err = do_page_crypto(inode, FS_ENCRYPT, plaintext_page->index, - plaintext_page, ciphertext_page, - gfp_flags); + ctx->w.control_page = page; + err = fscrypt_do_page_crypto(inode, FS_ENCRYPT, lblk_num, + page, ciphertext_page, len, offs, + gfp_flags); if (err) { ciphertext_page = ERR_PTR(err); goto errout; @@ -265,8 +301,13 @@ struct page *fscrypt_encrypt_page(struct inode *inode, EXPORT_SYMBOL(fscrypt_encrypt_page); /** - * f2crypt_decrypt_page() - Decrypts a page in-place - * @page: The page to decrypt. Must be locked. + * fscrypt_decrypt_page() - Decrypts a page in-place + * @inode: The corresponding inode for the page to decrypt. + * @page: The page to decrypt. Must be locked in case + * it is a writeback page (FS_CFLG_OWN_PAGES unset). + * @len: Number of bytes in @page to be decrypted. + * @offs: Start of data in @page. + * @lblk_num: Logical block number. * * Decrypts page in-place using the ctx encryption context. * @@ -274,75 +315,17 @@ EXPORT_SYMBOL(fscrypt_encrypt_page); * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_decrypt_page(struct page *page) +int fscrypt_decrypt_page(const struct inode *inode, struct page *page, + unsigned int len, unsigned int offs, u64 lblk_num) { - BUG_ON(!PageLocked(page)); + if (!(inode->i_sb->s_cop->flags & FS_CFLG_OWN_PAGES)) + BUG_ON(!PageLocked(page)); - return do_page_crypto(page->mapping->host, - FS_DECRYPT, page->index, page, page, GFP_NOFS); + return fscrypt_do_page_crypto(inode, FS_DECRYPT, lblk_num, page, page, + len, offs, GFP_NOFS); } EXPORT_SYMBOL(fscrypt_decrypt_page); -int fscrypt_zeroout_range(struct inode *inode, pgoff_t lblk, - sector_t pblk, unsigned int len) -{ - struct fscrypt_ctx *ctx; - struct page *ciphertext_page = NULL; - struct bio *bio; - int ret, err = 0; - - BUG_ON(inode->i_sb->s_blocksize != PAGE_SIZE); - - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return PTR_ERR(ctx); - - ciphertext_page = alloc_bounce_page(ctx, GFP_NOWAIT); - if (IS_ERR(ciphertext_page)) { - err = PTR_ERR(ciphertext_page); - goto errout; - } - - while (len--) { - err = do_page_crypto(inode, FS_ENCRYPT, lblk, - ZERO_PAGE(0), ciphertext_page, - GFP_NOFS); - if (err) - goto errout; - - bio = bio_alloc(GFP_NOWAIT, 1); - if (!bio) { - err = -ENOMEM; - goto errout; - } - bio->bi_bdev = inode->i_sb->s_bdev; - bio->bi_iter.bi_sector = - pblk << (inode->i_sb->s_blocksize_bits - 9); - ret = bio_add_page(bio, ciphertext_page, - inode->i_sb->s_blocksize, 0); - if (ret != inode->i_sb->s_blocksize) { - /* should never happen! */ - WARN_ON(1); - bio_put(bio); - err = -EIO; - goto errout; - } - err = submit_bio_wait(WRITE, bio); - if ((err == 0) && bio->bi_error) - err = -EIO; - bio_put(bio); - if (err) - goto errout; - lblk++; - pblk++; - } - err = 0; -errout: - fscrypt_release_ctx(ctx); - return err; -} -EXPORT_SYMBOL(fscrypt_zeroout_range); - /* * Validate dentries for encrypted directories to make sure we aren't * potentially caching stale data after a key has been added or @@ -351,7 +334,6 @@ EXPORT_SYMBOL(fscrypt_zeroout_range); static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) { struct dentry *dir; - struct fscrypt_info *ci; int dir_has_key, cached_with_key; if (flags & LOOKUP_RCU) @@ -363,18 +345,11 @@ static int fscrypt_d_revalidate(struct dentry *dentry, unsigned int flags) return 0; } - ci = d_inode(dir)->i_crypt_info; - if (ci && ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD)))) - ci = NULL; - /* this should eventually be an flag in d_flags */ spin_lock(&dentry->d_lock); cached_with_key = dentry->d_flags & DCACHE_ENCRYPTED_WITH_KEY; spin_unlock(&dentry->d_lock); - dir_has_key = (ci != NULL); + dir_has_key = (d_inode(dir)->i_crypt_info != NULL); dput(dir); /* @@ -399,63 +374,6 @@ const struct dentry_operations fscrypt_d_ops = { }; EXPORT_SYMBOL(fscrypt_d_ops); -/* - * Call fscrypt_decrypt_page on every single page, reusing the encryption - * context. - */ -static void completion_pages(struct work_struct *work) -{ - struct fscrypt_ctx *ctx = - container_of(work, struct fscrypt_ctx, r.work); - struct bio *bio = ctx->r.bio; - struct bio_vec *bv; - int i; - - bio_for_each_segment_all(bv, bio, i) { - struct page *page = bv->bv_page; - int ret = fscrypt_decrypt_page(page); - - if (ret) { - WARN_ON_ONCE(1); - SetPageError(page); - } else { - SetPageUptodate(page); - } - unlock_page(page); - } - fscrypt_release_ctx(ctx); - bio_put(bio); -} - -void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, struct bio *bio) -{ - INIT_WORK(&ctx->r.work, completion_pages); - ctx->r.bio = bio; - queue_work(fscrypt_read_workqueue, &ctx->r.work); -} -EXPORT_SYMBOL(fscrypt_decrypt_bio_pages); - -void fscrypt_pullback_bio_page(struct page **page, bool restore) -{ - struct fscrypt_ctx *ctx; - struct page *bounce_page; - - /* The bounce data pages are unmapped. */ - if ((*page)->mapping) - return; - - /* The bounce data page is unmapped. */ - bounce_page = *page; - ctx = (struct fscrypt_ctx *)page_private(bounce_page); - - /* restore control page */ - *page = ctx->w.control_page; - - if (restore) - fscrypt_restore_control_page(bounce_page); -} -EXPORT_SYMBOL(fscrypt_pullback_bio_page); - void fscrypt_restore_control_page(struct page *page) { struct fscrypt_ctx *ctx; @@ -481,17 +399,22 @@ static void fscrypt_destroy(void) /** * fscrypt_initialize() - allocate major buffers for fs encryption. + * @cop_flags: fscrypt operations flags * * We only call this when we start accessing encrypted files, since it * results in memory getting allocated that wouldn't otherwise be used. * * Return: Zero on success, non-zero otherwise. */ -int fscrypt_initialize(void) +int fscrypt_initialize(unsigned int cop_flags) { int i, res = -ENOMEM; - if (fscrypt_bounce_page_pool) + /* + * No need to allocate a bounce page pool if there already is one or + * this FS won't use it. + */ + if (cop_flags & FS_CFLG_OWN_PAGES || fscrypt_bounce_page_pool) return 0; mutex_lock(&fscrypt_init_mutex); @@ -520,7 +443,6 @@ int fscrypt_initialize(void) mutex_unlock(&fscrypt_init_mutex); return res; } -EXPORT_SYMBOL(fscrypt_initialize); /** * fscrypt_init() - Set up for fs encryption. @@ -562,6 +484,8 @@ static void __exit fscrypt_exit(void) destroy_workqueue(fscrypt_read_workqueue); kmem_cache_destroy(fscrypt_ctx_cachep); kmem_cache_destroy(fscrypt_info_cachep); + + fscrypt_essiv_cleanup(); } module_exit(fscrypt_exit); diff --git a/fs/crypto/fname.c b/fs/crypto/fname.c index 9b774f4b50c8..ad9f814fdead 100644 --- a/fs/crypto/fname.c +++ b/fs/crypto/fname.c @@ -12,7 +12,7 @@ #include #include -#include +#include "fscrypt_private.h" /** * fname_crypt_complete() - completion callback for filename crypto @@ -159,6 +159,8 @@ static int fname_decrypt(struct inode *inode, static const char *lookup_table = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+,"; +#define BASE64_CHARS(nbytes) DIV_ROUND_UP((nbytes) * 4, 3) + /** * digest_encode() - * @@ -209,7 +211,7 @@ static int digest_decode(const char *src, int len, char *dst) return cp - dst; } -u32 fscrypt_fname_encrypted_size(struct inode *inode, u32 ilen) +u32 fscrypt_fname_encrypted_size(const struct inode *inode, u32 ilen) { int padding = 32; struct fscrypt_info *ci = inode->i_crypt_info; @@ -227,14 +229,17 @@ EXPORT_SYMBOL(fscrypt_fname_encrypted_size); * Allocates an output buffer that is sufficient for the crypto operation * specified by the context and the direction. */ -int fscrypt_fname_alloc_buffer(struct inode *inode, +int fscrypt_fname_alloc_buffer(const struct inode *inode, u32 ilen, struct fscrypt_str *crypto_str) { - unsigned int olen = fscrypt_fname_encrypted_size(inode, ilen); + u32 olen = fscrypt_fname_encrypted_size(inode, ilen); + const u32 max_encoded_len = + max_t(u32, BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE), + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))); crypto_str->len = olen; - if (olen < FS_FNAME_CRYPTO_DIGEST_SIZE * 2) - olen = FS_FNAME_CRYPTO_DIGEST_SIZE * 2; + olen = max(olen, max_encoded_len); + /* * Allocated buffer can hold one more character to null-terminate the * string @@ -266,6 +271,10 @@ EXPORT_SYMBOL(fscrypt_fname_free_buffer); * * The caller must have allocated sufficient memory for the @oname string. * + * If the key is available, we'll decrypt the disk name; otherwise, we'll encode + * it for presentation. Short names are directly base64-encoded, while long + * names are encoded in fscrypt_digested_name format. + * * Return: 0 on success, -errno on failure */ int fscrypt_fname_disk_to_usr(struct inode *inode, @@ -274,7 +283,7 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, struct fscrypt_str *oname) { const struct qstr qname = FSTR_TO_QSTR(iname); - char buf[24]; + struct fscrypt_digested_name digested_name; if (fscrypt_is_dot_dotdot(&qname)) { oname->name[0] = '.'; @@ -289,20 +298,24 @@ int fscrypt_fname_disk_to_usr(struct inode *inode, if (inode->i_crypt_info) return fname_decrypt(inode, iname, oname); - if (iname->len <= FS_FNAME_CRYPTO_DIGEST_SIZE) { + if (iname->len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) { oname->len = digest_encode(iname->name, iname->len, oname->name); return 0; } if (hash) { - memcpy(buf, &hash, 4); - memcpy(buf + 4, &minor_hash, 4); + digested_name.hash = hash; + digested_name.minor_hash = minor_hash; } else { - memset(buf, 0, 8); + digested_name.hash = 0; + digested_name.minor_hash = 0; } - memcpy(buf + 8, iname->name + iname->len - 16, 16); + memcpy(digested_name.digest, + FSCRYPT_FNAME_DIGEST(iname->name, iname->len), + FSCRYPT_FNAME_DIGEST_SIZE); oname->name[0] = '_'; - oname->len = 1 + digest_encode(buf, 24, oname->name + 1); + oname->len = 1 + digest_encode((const char *)&digested_name, + sizeof(digested_name), oname->name + 1); return 0; } EXPORT_SYMBOL(fscrypt_fname_disk_to_usr); @@ -332,14 +345,39 @@ int fscrypt_fname_usr_to_disk(struct inode *inode, * in a directory. Consequently, a user space name cannot be mapped to * a disk-space name */ - return -EACCES; + return -ENOKEY; } EXPORT_SYMBOL(fscrypt_fname_usr_to_disk); +/** + * fscrypt_setup_filename() - prepare to search a possibly encrypted directory + * @dir: the directory that will be searched + * @iname: the user-provided filename being searched for + * @lookup: 1 if we're allowed to proceed without the key because it's + * ->lookup() or we're finding the dir_entry for deletion; 0 if we cannot + * proceed without the key because we're going to create the dir_entry. + * @fname: the filename information to be filled in + * + * Given a user-provided filename @iname, this function sets @fname->disk_name + * to the name that would be stored in the on-disk directory entry, if possible. + * If the directory is unencrypted this is simply @iname. Else, if we have the + * directory's encryption key, then @iname is the plaintext, so we encrypt it to + * get the disk_name. + * + * Else, for keyless @lookup operations, @iname is the presented ciphertext, so + * we decode it to get either the ciphertext disk_name (for short names) or the + * fscrypt_digested_name (for long names). Non-@lookup operations will be + * impossible in this case, so we fail them with ENOKEY. + * + * If successful, fscrypt_free_filename() must be called later to clean up. + * + * Return: 0 on success, -errno on failure + */ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, int lookup, struct fscrypt_name *fname) { - int ret = 0, bigname = 0; + int ret; + int digested; memset(fname, 0, sizeof(struct fscrypt_name)); fname->usr_fname = iname; @@ -350,7 +388,7 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, fname->disk_name.len = iname->len; return 0; } - ret = get_crypt_info(dir); + ret = fscrypt_get_encryption_info(dir); if (ret && ret != -EOPNOTSUPP) return ret; @@ -367,31 +405,43 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return 0; } if (!lookup) - return -EACCES; + return -ENOKEY; /* * We don't have the key and we are doing a lookup; decode the * user-supplied name */ - if (iname->name[0] == '_') - bigname = 1; - if ((bigname && (iname->len != 33)) || (!bigname && (iname->len > 43))) - return -ENOENT; + if (iname->name[0] == '_') { + if (iname->len != + 1 + BASE64_CHARS(sizeof(struct fscrypt_digested_name))) + return -ENOENT; + digested = 1; + } else { + if (iname->len > + BASE64_CHARS(FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE)) + return -ENOENT; + digested = 0; + } - fname->crypto_buf.name = kmalloc(32, GFP_KERNEL); + fname->crypto_buf.name = + kmalloc(max_t(size_t, FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE, + sizeof(struct fscrypt_digested_name)), + GFP_KERNEL); if (fname->crypto_buf.name == NULL) return -ENOMEM; - ret = digest_decode(iname->name + bigname, iname->len - bigname, + ret = digest_decode(iname->name + digested, iname->len - digested, fname->crypto_buf.name); if (ret < 0) { ret = -ENOENT; goto errout; } fname->crypto_buf.len = ret; - if (bigname) { - memcpy(&fname->hash, fname->crypto_buf.name, 4); - memcpy(&fname->minor_hash, fname->crypto_buf.name + 4, 4); + if (digested) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + fname->hash = n->hash; + fname->minor_hash = n->minor_hash; } else { fname->disk_name.name = fname->crypto_buf.name; fname->disk_name.len = fname->crypto_buf.len; @@ -403,12 +453,3 @@ int fscrypt_setup_filename(struct inode *dir, const struct qstr *iname, return ret; } EXPORT_SYMBOL(fscrypt_setup_filename); - -void fscrypt_free_filename(struct fscrypt_name *fname) -{ - kfree(fname->crypto_buf.name); - fname->crypto_buf.name = NULL; - fname->usr_fname = NULL; - fname->disk_name.name = NULL; -} -EXPORT_SYMBOL(fscrypt_free_filename); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h new file mode 100644 index 000000000000..79d79755d79b --- /dev/null +++ b/fs/crypto/fscrypt_private.h @@ -0,0 +1,107 @@ +/* + * fscrypt_private.h + * + * Copyright (C) 2015, Google, Inc. + * + * This contains encryption key functions. + * + * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. + */ + +#ifndef _FSCRYPT_PRIVATE_H +#define _FSCRYPT_PRIVATE_H + +#include +#include + +/* Encryption parameters */ +#define FS_IV_SIZE 16 +#define FS_AES_128_ECB_KEY_SIZE 16 +#define FS_AES_128_CBC_KEY_SIZE 16 +#define FS_AES_128_CTS_KEY_SIZE 16 +#define FS_AES_256_GCM_KEY_SIZE 32 +#define FS_AES_256_CBC_KEY_SIZE 32 +#define FS_AES_256_CTS_KEY_SIZE 32 +#define FS_AES_256_XTS_KEY_SIZE 64 + +#define FS_KEY_DERIVATION_NONCE_SIZE 16 + +/** + * Encryption context for inode + * + * Protector format: + * 1 byte: Protector format (1 = this version) + * 1 byte: File contents encryption mode + * 1 byte: File names encryption mode + * 1 byte: Flags + * 8 bytes: Master Key descriptor + * 16 bytes: Encryption Key derivation nonce + */ +struct fscrypt_context { + u8 format; + u8 contents_encryption_mode; + u8 filenames_encryption_mode; + u8 flags; + u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; + u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; +} __packed; + +#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 + +/* + * A pointer to this structure is stored in the file system's in-core + * representation of an inode. + */ +struct fscrypt_info { + u8 ci_data_mode; + u8 ci_filename_mode; + u8 ci_flags; + struct crypto_skcipher *ci_ctfm; + struct crypto_cipher *ci_essiv_tfm; + u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; +}; + +typedef enum { + FS_DECRYPT = 0, + FS_ENCRYPT, +} fscrypt_direction_t; + +#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 +#define FS_CTX_HAS_BOUNCE_BUFFER_FL 0x00000002 + +struct fscrypt_completion_result { + struct completion completion; + int res; +}; + +#define DECLARE_FS_COMPLETION_RESULT(ecr) \ + struct fscrypt_completion_result ecr = { \ + COMPLETION_INITIALIZER_ONSTACK((ecr).completion), 0 } + +/* bio stuffs */ +#define REQ_OP_READ READ +#define REQ_OP_WRITE WRITE +#define bio_op(bio) ((bio)->bi_rw & 1) + +static inline void bio_set_op_attrs(struct bio *bio, unsigned op, + unsigned op_flags) +{ + bio->bi_rw = op | op_flags; +} + +/* crypto.c */ +extern int fscrypt_initialize(unsigned int cop_flags); +extern struct workqueue_struct *fscrypt_read_workqueue; +extern int fscrypt_do_page_crypto(const struct inode *inode, + fscrypt_direction_t rw, u64 lblk_num, + struct page *src_page, + struct page *dest_page, + unsigned int len, unsigned int offs, + gfp_t gfp_flags); +extern struct page *fscrypt_alloc_bounce_page(struct fscrypt_ctx *ctx, + gfp_t gfp_flags); + +/* keyinfo.c */ +extern void __exit fscrypt_essiv_cleanup(void); + +#endif /* _FSCRYPT_PRIVATE_H */ diff --git a/fs/crypto/keyinfo.c b/fs/crypto/keyinfo.c index 67fb6d8876d0..66e0728e9bbe 100644 --- a/fs/crypto/keyinfo.c +++ b/fs/crypto/keyinfo.c @@ -10,7 +10,12 @@ #include #include -#include +#include +#include +#include +#include "fscrypt_private.h" + +static struct crypto_shash *essiv_hash_tfm; static void derive_crypt_complete(struct crypto_async_request *req, int rc) { @@ -27,13 +32,13 @@ static void derive_crypt_complete(struct crypto_async_request *req, int rc) * derive_key_aes() - Derive a key using AES-128-ECB * @deriving_key: Encryption key used for derivation. * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. + * @derived_raw_key: Derived raw key. * * Return: Zero on success; non-zero otherwise. */ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], - u8 source_key[FS_AES_256_XTS_KEY_SIZE], - u8 derived_key[FS_AES_256_XTS_KEY_SIZE]) + const struct fscrypt_key *source_key, + u8 derived_raw_key[FS_MAX_KEY_SIZE]) { int res = 0; struct skcipher_request *req = NULL; @@ -60,10 +65,10 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], if (res < 0) goto out; - sg_init_one(&src_sg, source_key, FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, FS_AES_256_XTS_KEY_SIZE); - skcipher_request_set_crypt(req, &src_sg, &dst_sg, - FS_AES_256_XTS_KEY_SIZE, NULL); + sg_init_one(&src_sg, source_key->raw, source_key->size); + sg_init_one(&dst_sg, derived_raw_key, source_key->size); + skcipher_request_set_crypt(req, &src_sg, &dst_sg, source_key->size, + NULL); res = crypto_skcipher_encrypt(req); if (res == -EINPROGRESS || res == -EBUSY) { wait_for_completion(&ecr.completion); @@ -77,28 +82,25 @@ static int derive_key_aes(u8 deriving_key[FS_AES_128_ECB_KEY_SIZE], static int validate_user_key(struct fscrypt_info *crypt_info, struct fscrypt_context *ctx, u8 *raw_key, - u8 *prefix, int prefix_size) + const char *prefix, int min_keysize) { - u8 *full_key_descriptor; + char *description; struct key *keyring_key; struct fscrypt_key *master_key; const struct user_key_payload *ukp; - int full_key_len = prefix_size + (FS_KEY_DESCRIPTOR_SIZE * 2) + 1; int res; - full_key_descriptor = kmalloc(full_key_len, GFP_NOFS); - if (!full_key_descriptor) + description = kasprintf(GFP_NOFS, "%s%*phN", prefix, + FS_KEY_DESCRIPTOR_SIZE, + ctx->master_key_descriptor); + if (!description) return -ENOMEM; - memcpy(full_key_descriptor, prefix, prefix_size); - sprintf(full_key_descriptor + prefix_size, - "%*phN", FS_KEY_DESCRIPTOR_SIZE, - ctx->master_key_descriptor); - full_key_descriptor[full_key_len - 1] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - kfree(full_key_descriptor); + keyring_key = request_key(&key_type_logon, description, NULL); + kfree(description); if (IS_ERR(keyring_key)) return PTR_ERR(keyring_key); + down_read(&keyring_key->sem); if (keyring_key->type != &key_type_logon) { printk_once(KERN_WARNING @@ -106,66 +108,68 @@ static int validate_user_key(struct fscrypt_info *crypt_info, res = -ENOKEY; goto out; } - down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct fscrypt_key)) { res = -EINVAL; - up_read(&keyring_key->sem); goto out; } master_key = (struct fscrypt_key *)ukp->data; BUILD_BUG_ON(FS_AES_128_ECB_KEY_SIZE != FS_KEY_DERIVATION_NONCE_SIZE); - if (master_key->size != FS_AES_256_XTS_KEY_SIZE) { + if (master_key->size < min_keysize || master_key->size > FS_MAX_KEY_SIZE + || master_key->size % AES_BLOCK_SIZE != 0) { printk_once(KERN_WARNING "%s: key size incorrect: %d\n", __func__, master_key->size); res = -ENOKEY; - up_read(&keyring_key->sem); goto out; } - res = derive_key_aes(ctx->nonce, master_key->raw, raw_key); - up_read(&keyring_key->sem); - if (res) - goto out; - - crypt_info->ci_keyring_key = keyring_key; - return 0; + res = derive_key_aes(ctx->nonce, master_key, raw_key); out: + up_read(&keyring_key->sem); key_put(keyring_key); return res; } +static const struct { + const char *cipher_str; + int keysize; +} available_modes[] = { + [FS_ENCRYPTION_MODE_AES_256_XTS] = { "xts(aes)", + FS_AES_256_XTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_256_CTS] = { "cts(cbc(aes))", + FS_AES_256_CTS_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CBC] = { "cbc(aes)", + FS_AES_128_CBC_KEY_SIZE }, + [FS_ENCRYPTION_MODE_AES_128_CTS] = { "cts(cbc(aes))", + FS_AES_128_CTS_KEY_SIZE }, +}; + static int determine_cipher_type(struct fscrypt_info *ci, struct inode *inode, const char **cipher_str_ret, int *keysize_ret) { + u32 mode; + + if (!fscrypt_valid_enc_modes(ci->ci_data_mode, ci->ci_filename_mode)) { + pr_warn_ratelimited("fscrypt: inode %lu uses unsupported encryption modes (contents mode %d, filenames mode %d)\n", + inode->i_ino, + ci->ci_data_mode, ci->ci_filename_mode); + return -EINVAL; + } + if (S_ISREG(inode->i_mode)) { - if (ci->ci_data_mode == FS_ENCRYPTION_MODE_AES_256_XTS) { - *cipher_str_ret = "xts(aes)"; - *keysize_ret = FS_AES_256_XTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported contents encryption mode " - "%d for inode %lu\n", - ci->ci_data_mode, inode->i_ino); - return -ENOKEY; + mode = ci->ci_data_mode; + } else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { + mode = ci->ci_filename_mode; + } else { + WARN_ONCE(1, "fscrypt: filesystem tried to load encryption info for inode %lu, which is not encryptable (file type %d)\n", + inode->i_ino, (inode->i_mode & S_IFMT)); + return -EINVAL; } - if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) { - if (ci->ci_filename_mode == FS_ENCRYPTION_MODE_AES_256_CTS) { - *cipher_str_ret = "cts(cbc(aes))"; - *keysize_ret = FS_AES_256_CTS_KEY_SIZE; - return 0; - } - pr_warn_once("fscrypto: unsupported filenames encryption mode " - "%d for inode %lu\n", - ci->ci_filename_mode, inode->i_ino); - return -ENOKEY; - } - - pr_warn_once("fscrypto: unsupported file type %d for inode %lu\n", - (inode->i_mode & S_IFMT), inode->i_ino); - return -ENOKEY; + *cipher_str_ret = available_modes[mode].cipher_str; + *keysize_ret = available_modes[mode].keysize; + return 0; } static void put_crypt_info(struct fscrypt_info *ci) @@ -173,12 +177,78 @@ static void put_crypt_info(struct fscrypt_info *ci) if (!ci) return; - key_put(ci->ci_keyring_key); crypto_free_skcipher(ci->ci_ctfm); + crypto_free_cipher(ci->ci_essiv_tfm); kmem_cache_free(fscrypt_info_cachep, ci); } -int get_crypt_info(struct inode *inode) +static int derive_essiv_salt(const u8 *key, int keysize, u8 *salt) +{ + struct crypto_shash *tfm = READ_ONCE(essiv_hash_tfm); + + /* init hash transform on demand */ + if (unlikely(!tfm)) { + struct crypto_shash *prev_tfm; + + tfm = crypto_alloc_shash("sha256", 0, 0); + if (IS_ERR(tfm)) { + pr_warn_ratelimited("fscrypt: error allocating SHA-256 transform: %ld\n", + PTR_ERR(tfm)); + return PTR_ERR(tfm); + } + prev_tfm = cmpxchg(&essiv_hash_tfm, NULL, tfm); + if (prev_tfm) { + crypto_free_shash(tfm); + tfm = prev_tfm; + } + } + + { + SHASH_DESC_ON_STACK(desc, tfm); + desc->tfm = tfm; + desc->flags = 0; + + return crypto_shash_digest(desc, key, keysize, salt); + } +} + +static int init_essiv_generator(struct fscrypt_info *ci, const u8 *raw_key, + int keysize) +{ + int err; + struct crypto_cipher *essiv_tfm; + u8 salt[SHA256_DIGEST_SIZE]; + + essiv_tfm = crypto_alloc_cipher("aes", 0, 0); + if (IS_ERR(essiv_tfm)) + return PTR_ERR(essiv_tfm); + + ci->ci_essiv_tfm = essiv_tfm; + + err = derive_essiv_salt(raw_key, keysize, salt); + if (err) + goto out; + + /* + * Using SHA256 to derive the salt/key will result in AES-256 being + * used for IV generation. File contents encryption will still use the + * configured keysize (AES-128) nevertheless. + */ + err = crypto_cipher_setkey(essiv_tfm, salt, sizeof(salt)); + if (err) + goto out; + +out: + memzero_explicit(salt, sizeof(salt)); + return err; +} + +void __exit fscrypt_essiv_cleanup(void) +{ + crypto_free_shash(essiv_hash_tfm); +} + +int fscrypt_get_encryption_info(struct inode *inode) { struct fscrypt_info *crypt_info; struct fscrypt_context ctx; @@ -188,30 +258,24 @@ int get_crypt_info(struct inode *inode) u8 *raw_key = NULL; int res; - res = fscrypt_initialize(); + if (inode->i_crypt_info) + return 0; + + res = fscrypt_initialize(inode->i_sb->s_cop->flags); if (res) return res; - if (!inode->i_sb->s_cop->get_context) - return -EOPNOTSUPP; -retry: - crypt_info = ACCESS_ONCE(inode->i_crypt_info); - if (crypt_info) { - if (!crypt_info->ci_keyring_key || - key_validate(crypt_info->ci_keyring_key) == 0) - return 0; - fscrypt_put_encryption_info(inode, crypt_info); - goto retry; - } - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); if (res < 0) { - if (!fscrypt_dummy_context_enabled(inode)) + if (!fscrypt_dummy_context_enabled(inode) || + inode->i_sb->s_cop->is_encrypted(inode)) return res; + /* Fake up a context for an unencrypted directory */ + memset(&ctx, 0, sizeof(ctx)); ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; + memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); } else if (res != sizeof(ctx)) { return -EINVAL; } @@ -230,7 +294,7 @@ int get_crypt_info(struct inode *inode) crypt_info->ci_data_mode = ctx.contents_encryption_mode; crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; crypt_info->ci_ctfm = NULL; - crypt_info->ci_keyring_key = NULL; + crypt_info->ci_essiv_tfm = NULL; memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, sizeof(crypt_info->ci_master_key)); @@ -247,20 +311,12 @@ int get_crypt_info(struct inode *inode) if (!raw_key) goto out; - if (fscrypt_dummy_context_enabled(inode)) { - memset(raw_key, 0x42, FS_AES_256_XTS_KEY_SIZE); - goto got_key; - } - - res = validate_user_key(crypt_info, &ctx, raw_key, - FS_KEY_DESC_PREFIX, FS_KEY_DESC_PREFIX_SIZE); + res = validate_user_key(crypt_info, &ctx, raw_key, FS_KEY_DESC_PREFIX, + keysize); if (res && inode->i_sb->s_cop->key_prefix) { - u8 *prefix = NULL; - int prefix_size, res2; - - prefix_size = inode->i_sb->s_cop->key_prefix(inode, &prefix); - res2 = validate_user_key(crypt_info, &ctx, raw_key, - prefix, prefix_size); + int res2 = validate_user_key(crypt_info, &ctx, raw_key, + inode->i_sb->s_cop->key_prefix, + keysize); if (res2) { if (res2 == -ENOKEY) res = -ENOKEY; @@ -269,30 +325,35 @@ int get_crypt_info(struct inode *inode) } else if (res) { goto out; } -got_key: ctfm = crypto_alloc_skcipher(cipher_str, 0, 0); if (!ctfm || IS_ERR(ctfm)) { res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); + pr_debug("%s: error %d (inode %lu) allocating crypto tfm\n", + __func__, res, inode->i_ino); goto out; } crypt_info->ci_ctfm = ctfm; crypto_skcipher_clear_flags(ctfm, ~0); crypto_skcipher_set_flags(ctfm, CRYPTO_TFM_REQ_WEAK_KEY); + /* + * if the provided key is longer than keysize, we use the first + * keysize bytes of the derived key only + */ res = crypto_skcipher_setkey(ctfm, raw_key, keysize); if (res) goto out; - kzfree(raw_key); - raw_key = NULL; - if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) != NULL) { - put_crypt_info(crypt_info); - goto retry; + if (S_ISREG(inode->i_mode) && + crypt_info->ci_data_mode == FS_ENCRYPTION_MODE_AES_128_CBC) { + res = init_essiv_generator(crypt_info, raw_key, keysize); + if (res) { + pr_debug("%s: error %d (inode %lu) allocating essiv tfm\n", + __func__, res, inode->i_ino); + goto out; + } } - return 0; - + if (cmpxchg(&inode->i_crypt_info, NULL, crypt_info) == NULL) + crypt_info = NULL; out: if (res == -ENOKEY) res = 0; @@ -300,6 +361,7 @@ int get_crypt_info(struct inode *inode) kzfree(raw_key); return res; } +EXPORT_SYMBOL(fscrypt_get_encryption_info); void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) { @@ -317,17 +379,3 @@ void fscrypt_put_encryption_info(struct inode *inode, struct fscrypt_info *ci) put_crypt_info(ci); } EXPORT_SYMBOL(fscrypt_put_encryption_info); - -int fscrypt_get_encryption_info(struct inode *inode) -{ - struct fscrypt_info *ci = inode->i_crypt_info; - - if (!ci || - (ci->ci_keyring_key && - (ci->ci_keyring_key->flags & ((1 << KEY_FLAG_INVALIDATED) | - (1 << KEY_FLAG_REVOKED) | - (1 << KEY_FLAG_DEAD))))) - return get_crypt_info(inode); - return 0; -} -EXPORT_SYMBOL(fscrypt_get_encryption_info); diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c index 6865663aac69..9914d51dff86 100644 --- a/fs/crypto/policy.c +++ b/fs/crypto/policy.c @@ -10,76 +10,37 @@ #include #include -#include #include - -static int inode_has_encryption_context(struct inode *inode) -{ - if (!inode->i_sb->s_cop->get_context) - return 0; - return (inode->i_sb->s_cop->get_context(inode, NULL, 0L) > 0); -} +#include "fscrypt_private.h" /* - * check whether the policy is consistent with the encryption context - * for the inode + * check whether an encryption policy is consistent with an encryption context */ -static int is_encryption_context_consistent_with_policy(struct inode *inode, +static bool is_encryption_context_consistent_with_policy( + const struct fscrypt_context *ctx, const struct fscrypt_policy *policy) { - struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->get_context) - return 0; - - res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); + return memcmp(ctx->master_key_descriptor, policy->master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (ctx->flags == policy->flags) && + (ctx->contents_encryption_mode == + policy->contents_encryption_mode) && + (ctx->filenames_encryption_mode == + policy->filenames_encryption_mode); } static int create_encryption_context_from_policy(struct inode *inode, const struct fscrypt_policy *policy) { struct fscrypt_context ctx; - int res; - - if (!inode->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - - if (inode->i_sb->s_cop->prepare_context) { - res = inode->i_sb->s_cop->prepare_context(inode); - if (res) - return res; - } ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); - if (!fscrypt_valid_contents_enc_mode( - policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); + if (!fscrypt_valid_enc_modes(policy->contents_encryption_mode, + policy->filenames_encryption_mode)) return -EINVAL; - } - - if (!fscrypt_valid_filenames_enc_mode( - policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } if (policy->flags & ~FS_POLICY_FLAGS_VALID) return -EINVAL; @@ -93,16 +54,20 @@ static int create_encryption_context_from_policy(struct inode *inode, return inode->i_sb->s_cop->set_context(inode, &ctx, sizeof(ctx), NULL); } -int fscrypt_process_policy(struct file *filp, - const struct fscrypt_policy *policy) +int fscrypt_ioctl_set_policy(struct file *filp, const void __user *arg) { + struct fscrypt_policy policy; struct inode *inode = file_inode(filp); int ret; + struct fscrypt_context ctx; + + if (copy_from_user(&policy, arg, sizeof(policy))) + return -EFAULT; if (!inode_owner_or_capable(inode)) return -EACCES; - if (policy->version != 0) + if (policy.version != 0) return -EINVAL; ret = mnt_want_write_file(filp); @@ -111,22 +76,23 @@ int fscrypt_process_policy(struct file *filp, inode_lock(inode); - if (!inode_has_encryption_context(inode)) { + ret = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (ret == -ENODATA) { if (!S_ISDIR(inode->i_mode)) - ret = -EINVAL; - else if (!inode->i_sb->s_cop->empty_dir) - ret = -EOPNOTSUPP; + ret = -ENOTDIR; else if (!inode->i_sb->s_cop->empty_dir(inode)) ret = -ENOTEMPTY; else ret = create_encryption_context_from_policy(inode, - policy); - } else if (!is_encryption_context_consistent_with_policy(inode, - policy)) { - printk(KERN_WARNING - "%s: Policy inconsistent with encryption context\n", - __func__); - ret = -EINVAL; + &policy); + } else if (ret == sizeof(ctx) && + is_encryption_context_consistent_with_policy(&ctx, + &policy)) { + /* The file already uses the same encryption policy. */ + ret = 0; + } else if (ret >= 0 || ret == -ERANGE) { + /* The file already uses a different encryption policy. */ + ret = -EEXIST; } inode_unlock(inode); @@ -134,49 +100,94 @@ int fscrypt_process_policy(struct file *filp, mnt_drop_write_file(filp); return ret; } -EXPORT_SYMBOL(fscrypt_process_policy); +EXPORT_SYMBOL(fscrypt_ioctl_set_policy); -int fscrypt_get_policy(struct inode *inode, struct fscrypt_policy *policy) +int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) { + struct inode *inode = file_inode(filp); struct fscrypt_context ctx; + struct fscrypt_policy policy; int res; - if (!inode->i_sb->s_cop->get_context || - !inode->i_sb->s_cop->is_encrypted(inode)) + if (!inode->i_sb->s_cop->is_encrypted(inode)) return -ENODATA; res = inode->i_sb->s_cop->get_context(inode, &ctx, sizeof(ctx)); + if (res < 0 && res != -ERANGE) + return res; if (res != sizeof(ctx)) - return -ENODATA; + return -EINVAL; if (ctx.format != FS_ENCRYPTION_CONTEXT_FORMAT_V1) return -EINVAL; - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, + policy.version = 0; + policy.contents_encryption_mode = ctx.contents_encryption_mode; + policy.filenames_encryption_mode = ctx.filenames_encryption_mode; + policy.flags = ctx.flags; + memcpy(policy.master_key_descriptor, ctx.master_key_descriptor, FS_KEY_DESCRIPTOR_SIZE); + + if (copy_to_user(arg, &policy, sizeof(policy))) + return -EFAULT; return 0; } -EXPORT_SYMBOL(fscrypt_get_policy); +EXPORT_SYMBOL(fscrypt_ioctl_get_policy); +/** + * fscrypt_has_permitted_context() - is a file's encryption policy permitted + * within its directory? + * + * @parent: inode for parent directory + * @child: inode for file being looked up, opened, or linked into @parent + * + * Filesystems must call this before permitting access to an inode in a + * situation where the parent directory is encrypted (either before allowing + * ->lookup() to succeed, or for a regular file before allowing it to be opened) + * and before any operation that involves linking an inode into an encrypted + * directory, including link, rename, and cross rename. It enforces the + * constraint that within a given encrypted directory tree, all files use the + * same encryption policy. The pre-access check is needed to detect potentially + * malicious offline violations of this constraint, while the link and rename + * checks are needed to prevent online violations of this constraint. + * + * Return: 1 if permitted, 0 if forbidden. If forbidden, the caller must fail + * the filesystem operation with EPERM. + */ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) { - struct fscrypt_info *parent_ci, *child_ci; + const struct fscrypt_operations *cops = parent->i_sb->s_cop; + const struct fscrypt_info *parent_ci, *child_ci; + struct fscrypt_context parent_ctx, child_ctx; int res; - if ((parent == NULL) || (child == NULL)) { - printk(KERN_ERR "parent %p child %p\n", parent, child); - BUG_ON(1); - } - - /* no restrictions if the parent directory is not encrypted */ - if (!parent->i_sb->s_cop->is_encrypted(parent)) + /* No restrictions on file types which are never encrypted */ + if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && + !S_ISLNK(child->i_mode)) return 1; - /* if the child directory is not encrypted, this is always a problem */ - if (!parent->i_sb->s_cop->is_encrypted(child)) + + /* No restrictions if the parent directory is unencrypted */ + if (!cops->is_encrypted(parent)) + return 1; + + /* Encrypted directories must not contain unencrypted files */ + if (!cops->is_encrypted(child)) return 0; + + /* + * Both parent and child are encrypted, so verify they use the same + * encryption policy. Compare the fscrypt_info structs if the keys are + * available, otherwise retrieve and compare the fscrypt_contexts. + * + * Note that the fscrypt_context retrieval will be required frequently + * when accessing an encrypted directory tree without the key. + * Performance-wise this is not a big deal because we already don't + * really optimize for file access without the key (to the extent that + * such access is even possible), given that any attempted access + * already causes a fscrypt_context retrieval and keyring search. + * + * In any case, if an unexpected error occurs, fall back to "forbidden". + */ + res = fscrypt_get_encryption_info(parent); if (res) return 0; @@ -185,17 +196,32 @@ int fscrypt_has_permitted_context(struct inode *parent, struct inode *child) return 0; parent_ci = parent->i_crypt_info; child_ci = child->i_crypt_info; - if (!parent_ci && !child_ci) - return 1; - if (!parent_ci || !child_ci) + + if (parent_ci && child_ci) { + return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ci->ci_data_mode == child_ci->ci_data_mode) && + (parent_ci->ci_filename_mode == + child_ci->ci_filename_mode) && + (parent_ci->ci_flags == child_ci->ci_flags); + } + + res = cops->get_context(parent, &parent_ctx, sizeof(parent_ctx)); + if (res != sizeof(parent_ctx)) return 0; - return (memcmp(parent_ci->ci_master_key, - child_ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags)); + res = cops->get_context(child, &child_ctx, sizeof(child_ctx)); + if (res != sizeof(child_ctx)) + return 0; + + return memcmp(parent_ctx.master_key_descriptor, + child_ctx.master_key_descriptor, + FS_KEY_DESCRIPTOR_SIZE) == 0 && + (parent_ctx.contents_encryption_mode == + child_ctx.contents_encryption_mode) && + (parent_ctx.filenames_encryption_mode == + child_ctx.filenames_encryption_mode) && + (parent_ctx.flags == child_ctx.flags); } EXPORT_SYMBOL(fscrypt_has_permitted_context); @@ -204,9 +230,9 @@ EXPORT_SYMBOL(fscrypt_has_permitted_context); * @parent: Parent inode from which the context is inherited. * @child: Child inode that inherits the context from @parent. * @fs_data: private data given by FS. - * @preload: preload child i_crypt_info + * @preload: preload child i_crypt_info if true * - * Return: Zero on success, non-zero otherwise + * Return: 0 on success, -errno on failure */ int fscrypt_inherit_context(struct inode *parent, struct inode *child, void *fs_data, bool preload) @@ -215,9 +241,6 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, struct fscrypt_info *ci; int res; - if (!parent->i_sb->s_cop->set_context) - return -EOPNOTSUPP; - res = fscrypt_get_encryption_info(parent); if (res < 0) return res; @@ -227,19 +250,11 @@ int fscrypt_inherit_context(struct inode *parent, struct inode *child, return -ENOKEY; ctx.format = FS_ENCRYPTION_CONTEXT_FORMAT_V1; - if (fscrypt_dummy_context_enabled(parent)) { - ctx.contents_encryption_mode = FS_ENCRYPTION_MODE_AES_256_XTS; - ctx.filenames_encryption_mode = FS_ENCRYPTION_MODE_AES_256_CTS; - ctx.flags = 0; - memset(ctx.master_key_descriptor, 0x42, FS_KEY_DESCRIPTOR_SIZE); - res = 0; - } else { - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - FS_KEY_DESCRIPTOR_SIZE); - } + ctx.contents_encryption_mode = ci->ci_data_mode; + ctx.filenames_encryption_mode = ci->ci_filename_mode; + ctx.flags = ci->ci_flags; + memcpy(ctx.master_key_descriptor, ci->ci_master_key, + FS_KEY_DESCRIPTOR_SIZE); get_random_bytes(ctx.nonce, FS_KEY_DERIVATION_NONCE_SIZE); res = parent->i_sb->s_cop->set_context(child, &ctx, sizeof(ctx), fs_data); diff --git a/fs/f2fs/Makefile b/fs/f2fs/Makefile index ca949ea7c02f..a0dc559b1b47 100644 --- a/fs/f2fs/Makefile +++ b/fs/f2fs/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_F2FS_FS) += f2fs.o f2fs-y := dir.o file.o inode.o namei.o hash.o super.o inline.o f2fs-y += checkpoint.o gc.o data.o node.o segment.o recovery.o -f2fs-y += shrinker.o extent_cache.o +f2fs-y += shrinker.o extent_cache.o sysfs.o f2fs-$(CONFIG_F2FS_STAT_FS) += debug.o f2fs-$(CONFIG_F2FS_FS_XATTR) += xattr.o f2fs-$(CONFIG_F2FS_FS_POSIX_ACL) += acl.o diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index a45d1f4b7b0f..112f8e04c549 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -210,15 +210,16 @@ static int __f2fs_set_acl(struct inode *inode, int type, void *value = NULL; size_t size = 0; int error; + umode_t mode = inode->i_mode; switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl) { - error = posix_acl_update_mode(inode, &inode->i_mode, &acl); + if (acl && !ipage) { + error = posix_acl_update_mode(inode, &mode, &acl); if (error) return error; - set_acl_inode(inode, inode->i_mode); + set_acl_inode(inode, mode); } break; @@ -236,7 +237,7 @@ static int __f2fs_set_acl(struct inode *inode, int type, value = f2fs_acl_to_disk(F2FS_I_SB(inode), acl, &size); if (IS_ERR(value)) { clear_inode_flag(inode, FI_ACL_MODE); - return (int)PTR_ERR(value); + return PTR_ERR(value); } } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index 45ef3b6bfb04..e86f67ac96c6 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -31,7 +31,7 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io) set_ckpt_flags(sbi, CP_ERROR_FLAG); sbi->sb->s_flags |= MS_RDONLY; if (!end_io) - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); } /* @@ -163,6 +163,7 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, .op_flags = sync ? (REQ_SYNC | REQ_META | REQ_PRIO) : REQ_RAHEAD, .encrypted_page = NULL, + .in_list = false, }; struct blk_plug plug; @@ -208,12 +209,10 @@ int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, } fio.page = page; - fio.old_blkaddr = fio.new_blkaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_bio(&fio); f2fs_put_page(page, 0); } out: - f2fs_submit_merged_bio(sbi, META, READ); blk_finish_plug(&plug); return blkno - start; } @@ -232,8 +231,9 @@ void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index) ra_meta_pages(sbi, index, BIO_MAX_PAGES, META_POR, true); } -static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) +static int __f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc, + enum iostat_type io_type) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); @@ -246,16 +246,17 @@ static int f2fs_write_meta_page(struct page *page, if (unlikely(f2fs_cp_error(sbi))) goto redirty_out; - write_meta_page(sbi, page); + write_meta_page(sbi, page, io_type); dec_page_count(sbi, F2FS_DIRTY_META); if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, META, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, META); unlock_page(page); if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, META, WRITE); + f2fs_submit_merged_write(sbi, META); return 0; @@ -264,23 +265,33 @@ static int f2fs_write_meta_page(struct page *page, return AOP_WRITEPAGE_ACTIVATE; } +static int f2fs_write_meta_page(struct page *page, + struct writeback_control *wbc) +{ + return __f2fs_write_meta_page(page, wbc, FS_META_IO); +} + static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { struct f2fs_sb_info *sbi = F2FS_M_SB(mapping); long diff, written; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* collect a number of dirty meta pages and write together */ if (wbc->for_kupdate || get_pages(sbi, F2FS_DIRTY_META) < nr_pages_to_skip(sbi, META)) goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, META); + /* if locked failed, cp will flush dirty pages instead */ + if (!mutex_trylock(&sbi->cp_mutex)) + goto skip_write; - /* if mounting is failed, skip writing node pages */ - mutex_lock(&sbi->cp_mutex); + trace_f2fs_writepages(mapping->host, wbc, META); diff = nr_pages_to_write(sbi, META, wbc); - written = sync_meta_pages(sbi, META, wbc->nr_to_write); + written = sync_meta_pages(sbi, META, wbc->nr_to_write, FS_META_IO); mutex_unlock(&sbi->cp_mutex); wbc->nr_to_write = max((long)0, wbc->nr_to_write - written - diff); return 0; @@ -292,7 +303,7 @@ static int f2fs_write_meta_pages(struct address_space *mapping, } long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, - long nr_to_write) + long nr_to_write, enum iostat_type io_type) { struct address_space *mapping = META_MAPPING(sbi); pgoff_t index = 0, end = ULONG_MAX, prev = ULONG_MAX; @@ -343,7 +354,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - if (mapping->a_ops->writepage(page, &wbc)) { + if (__f2fs_write_meta_page(page, &wbc, io_type)) { unlock_page(page); break; } @@ -357,7 +368,7 @@ long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, } stop: if (nwritten) - f2fs_submit_merged_bio(sbi, type, WRITE); + f2fs_submit_merged_write(sbi, type); blk_finish_plug(&plug); @@ -494,6 +505,7 @@ int acquire_orphan_inode(struct f2fs_sb_info *sbi) #ifdef CONFIG_F2FS_FAULT_INJECTION if (time_to_inject(sbi, FAULT_ORPHAN)) { spin_unlock(&im->ino_lock); + f2fs_show_injection_info(FAULT_ORPHAN); return -ENOSPC; } #endif @@ -566,7 +578,7 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) if (ni.blk_addr != NULL_ADDR) { set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_msg(sbi->sb, KERN_WARNING, - "%s: orphan failed (ino=%x), run fsck to fix.", + "%s: orphan failed (ino=%x) by kernel, retry mount.", __func__, ino); return -EIO; } @@ -577,11 +589,24 @@ static int recover_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino) int recover_orphan_inodes(struct f2fs_sb_info *sbi) { block_t start_blk, orphan_blocks, i, j; - int err; + unsigned int s_flags = sbi->sb->s_flags; + int err = 0; if (!is_set_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG)) return 0; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + start_blk = __start_cp_addr(sbi) + 1 + __cp_payload(sbi); orphan_blocks = __start_sum_addr(sbi) - 1 - __cp_payload(sbi); @@ -597,14 +622,21 @@ int recover_orphan_inodes(struct f2fs_sb_info *sbi) err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); - return err; + goto out; } } f2fs_put_page(page, 1); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); - return 0; +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + + return err; } static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) @@ -676,14 +708,13 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); - if (crc_offset >= blk_size) { + if (crc_offset > (blk_size - sizeof(__le32))) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc_offset: %zu", crc_offset); return -EINVAL; } - crc = le32_to_cpu(*((__le32 *)((unsigned char *)*cp_block - + crc_offset))); + crc = cur_cp_crc(*cp_block); if (!f2fs_crc_valid(sbi, crc, *cp_block, crc_offset)) { f2fs_msg(sbi->sb, KERN_WARNING, "invalid crc value"); return -EINVAL; @@ -816,7 +847,9 @@ static void __add_dirty_inode(struct inode *inode, enum inode_type type) return; set_inode_flag(inode, flag); - list_add_tail(&F2FS_I(inode)->dirty_list, &sbi->inode_list[type]); + if (!f2fs_is_volatile_file(inode)) + list_add_tail(&F2FS_I(inode)->dirty_list, + &sbi->inode_list[type]); stat_inc_dirty_inode(sbi, type); } @@ -874,6 +907,7 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) struct inode *inode; struct f2fs_inode_info *fi; bool is_dir = (type == DIR_INODE); + unsigned long ino = 0; trace_f2fs_sync_dirty_inodes_enter(sbi->sb, is_dir, get_pages(sbi, is_dir ? @@ -896,14 +930,30 @@ int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type) inode = igrab(&fi->vfs_inode); spin_unlock(&sbi->inode_lock[type]); if (inode) { + unsigned long cur_ino = inode->i_ino; + + if (is_dir) + F2FS_I(inode)->cp_task = current; + filemap_fdatawrite(inode->i_mapping); + + if (is_dir) + F2FS_I(inode)->cp_task = NULL; + iput(inode); + /* We need to give cpu to another writers. */ + if (ino == cur_ino) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + } else { + ino = cur_ino; + } } else { /* * We should submit bio, since it exists several * wribacking dentry pages in the freeing inode. */ - f2fs_submit_merged_bio(sbi, DATA, WRITE); + f2fs_submit_merged_write(sbi, DATA); cond_resched(); } goto retry; @@ -941,6 +991,19 @@ int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi) return 0; } +static void __prepare_cp_block(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + nid_t last_nid = nm_i->next_scan_nid; + + next_free_nid(sbi, &last_nid); + ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); + ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); + ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); + ckpt->next_free_nid = cpu_to_le32(last_nid); +} + /* * Freeze all the FS-operations for checkpoint. */ @@ -964,33 +1027,47 @@ static int block_operations(struct f2fs_sb_info *sbi) err = sync_dirty_inodes(sbi, DIR_INODE); if (err) goto out; - goto retry_flush_dents; - } - - if (get_pages(sbi, F2FS_DIRTY_IMETA)) { - f2fs_unlock_all(sbi); - err = f2fs_sync_inode_meta(sbi); - if (err) - goto out; + cond_resched(); goto retry_flush_dents; } /* * POR: we should ensure that there are no dirty node pages - * until finishing nat/sit flush. + * until finishing nat/sit flush. inode->i_blocks can be updated. */ + down_write(&sbi->node_change); + + if (get_pages(sbi, F2FS_DIRTY_IMETA)) { + up_write(&sbi->node_change); + f2fs_unlock_all(sbi); + err = f2fs_sync_inode_meta(sbi); + if (err) + goto out; + cond_resched(); + goto retry_flush_dents; + } + retry_flush_nodes: down_write(&sbi->node_write); if (get_pages(sbi, F2FS_DIRTY_NODES)) { up_write(&sbi->node_write); - err = sync_node_pages(sbi, &wbc); + err = sync_node_pages(sbi, &wbc, false, FS_CP_NODE_IO); if (err) { + up_write(&sbi->node_change); f2fs_unlock_all(sbi); goto out; } + cond_resched(); goto retry_flush_nodes; } + + /* + * sbi->node_change is used only for AIO write_begin path which produces + * dirty node blocks and some checkpoint values by block allocation. + */ + __prepare_cp_block(sbi); + up_write(&sbi->node_change); out: blk_finish_plug(&plug); return err; @@ -999,8 +1076,6 @@ static int block_operations(struct f2fs_sb_info *sbi) static void unblock_operations(struct f2fs_sb_info *sbi) { up_write(&sbi->node_write); - - build_free_nids(sbi, false); f2fs_unlock_all(sbi); } @@ -1023,15 +1098,24 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) { unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + unsigned long flags; - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); - if (cpc->reason == CP_UMOUNT) + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + + if (cpc->reason & CP_TRIMMED) + __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); + + if (cpc->reason & CP_UMOUNT) __set_ckpt_flags(ckpt, CP_UMOUNT_FLAG); else __clear_ckpt_flags(ckpt, CP_UMOUNT_FLAG); - if (cpc->reason == CP_FASTBOOT) + if (cpc->reason & CP_FASTBOOT) __set_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); else __clear_ckpt_flags(ckpt, CP_FASTBOOT_FLAG); @@ -1047,15 +1131,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* set this flag to activate crc|cp_ver for recovery */ __set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num; - nid_t last_nid = nm_i->next_scan_nid; + unsigned long orphan_num = sbi->im[ORPHAN_INO].ino_num, flags; block_t start_blk; unsigned int data_sum_blocks, orphan_blocks; __u32 crc32 = 0; @@ -1067,19 +1150,16 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) /* Flush all the NAT/SIT pages */ while (get_pages(sbi, F2FS_DIRTY_META)) { - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); if (unlikely(f2fs_cp_error(sbi))) return -EIO; } - next_free_nid(sbi, &last_nid); - /* * modify checkpoint * version number is already updated */ ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi)); - ckpt->valid_block_count = cpu_to_le64(valid_user_blocks(sbi)); ckpt->free_segment_count = cpu_to_le32(free_segments(sbi)); for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { ckpt->cur_node_segno[i] = @@ -1098,18 +1178,14 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) curseg_alloc_type(sbi, i + CURSEG_HOT_DATA); } - ckpt->valid_node_count = cpu_to_le32(valid_node_count(sbi)); - ckpt->valid_inode_count = cpu_to_le32(valid_inode_count(sbi)); - ckpt->next_free_nid = cpu_to_le32(last_nid); - /* 2 cp + n data seg summary + orphan inode blocks */ data_sum_blocks = npages_for_summary_flush(sbi, false); - spin_lock(&sbi->cp_lock); + spin_lock_irqsave(&sbi->cp_lock, flags); if (data_sum_blocks < NR_CURSEG_DATA_TYPE) __set_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); else __clear_ckpt_flags(ckpt, CP_COMPACT_SUM_FLAG); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); orphan_blocks = GET_ORPHAN_BLOCKS(orphan_num); ckpt->cp_pack_start_sum = cpu_to_le32(1 + cp_payload_blks + @@ -1138,6 +1214,27 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); + /* write nat bits */ + if (enabled_nat_bits(sbi, cpc)) { + __u64 cp_ver = cur_cp_version(ckpt); + block_t blk; + + cp_ver |= ((__u64)crc32 << 32); + *(__le64 *)nm_i->nat_bits = cpu_to_le64(cp_ver); + + blk = start_blk + sbi->blocks_per_seg - nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) + update_meta_page(sbi, nm_i->nat_bits + + (i << F2FS_BLKSIZE_BITS), blk + i); + + /* Flush all the NAT BITS pages */ + while (get_pages(sbi, F2FS_DIRTY_META)) { + sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO); + if (unlikely(f2fs_cp_error(sbi))) + return -EIO; + } + } + /* need to wait for end_io results */ wait_on_all_pages_writeback(sbi); if (unlikely(f2fs_cp_error(sbi))) @@ -1187,7 +1284,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) percpu_counter_set(&sbi->alloc_valid_block_count, 0); /* Here, we only have one bio having CP pack */ - sync_meta_pages(sbi, META_FLUSH, LONG_MAX); + sync_meta_pages(sbi, META_FLUSH, LONG_MAX, FS_CP_META_IO); /* wait for previous submitted meta pages writeback */ wait_on_all_pages_writeback(sbi); @@ -1226,8 +1323,8 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) mutex_lock(&sbi->cp_mutex); if (!is_sbi_flag_set(sbi, SBI_IS_DIRTY) && - (cpc->reason == CP_FASTBOOT || cpc->reason == CP_SYNC || - (cpc->reason == CP_DISCARD && !sbi->discard_blks))) + ((cpc->reason & CP_FASTBOOT) || (cpc->reason & CP_SYNC) || + ((cpc->reason & CP_DISCARD) && !sbi->discard_blks))) goto out; if (unlikely(f2fs_cp_error(sbi))) { err = -EIO; @@ -1246,10 +1343,10 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish block_ops"); - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); /* this is the case of multiple fstrims without any changes */ - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { if (!exist_trim_candidates(sbi, cpc)) { unblock_operations(sbi); goto out; @@ -1274,7 +1371,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) ckpt->checkpoint_ver = cpu_to_le64(++ckpt_ver); /* write cached NAT/SIT entries to NAT/SIT area */ - flush_nat_entries(sbi); + flush_nat_entries(sbi, cpc); flush_sit_entries(sbi, cpc); /* unlock all the fs_lock[] in do_checkpoint() */ @@ -1287,7 +1384,7 @@ int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) unblock_operations(sbi); stat_inc_cp_count(sbi->stat_info); - if (cpc->reason == CP_RECOVERY) + if (cpc->reason & CP_RECOVERY) f2fs_msg(sbi->sb, KERN_NOTICE, "checkpoint: version = %llx", ckpt_ver); diff --git a/fs/f2fs/crypto_key.c b/fs/f2fs/crypto_key.c deleted file mode 100644 index 18595d7a0efc..000000000000 --- a/fs/f2fs/crypto_key.c +++ /dev/null @@ -1,240 +0,0 @@ -/* - * linux/fs/f2fs/crypto_key.c - * - * Copied from linux/fs/f2fs/crypto_key.c - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption key functions for f2fs - * - * Written by Michael Halcrow, Ildar Muslukhov, and Uday Savagaonkar, 2015. - */ -#include -#include -#include -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -static void derive_crypt_complete(struct crypto_async_request *req, int rc) -{ - struct f2fs_completion_result *ecr = req->data; - - if (rc == -EINPROGRESS) - return; - - ecr->res = rc; - complete(&ecr->completion); -} - -/** - * f2fs_derive_key_aes() - Derive a key using AES-128-ECB - * @deriving_key: Encryption key used for derivatio. - * @source_key: Source key to which to apply derivation. - * @derived_key: Derived key. - * - * Return: Zero on success; non-zero otherwise. - */ -static int f2fs_derive_key_aes(char deriving_key[F2FS_AES_128_ECB_KEY_SIZE], - char source_key[F2FS_AES_256_XTS_KEY_SIZE], - char derived_key[F2FS_AES_256_XTS_KEY_SIZE]) -{ - int res = 0; - struct ablkcipher_request *req = NULL; - DECLARE_F2FS_COMPLETION_RESULT(ecr); - struct scatterlist src_sg, dst_sg; - struct crypto_ablkcipher *tfm = crypto_alloc_ablkcipher("ecb(aes)", 0, - 0); - - if (IS_ERR(tfm)) { - res = PTR_ERR(tfm); - tfm = NULL; - goto out; - } - crypto_ablkcipher_set_flags(tfm, CRYPTO_TFM_REQ_WEAK_KEY); - req = ablkcipher_request_alloc(tfm, GFP_NOFS); - if (!req) { - res = -ENOMEM; - goto out; - } - ablkcipher_request_set_callback(req, - CRYPTO_TFM_REQ_MAY_BACKLOG | CRYPTO_TFM_REQ_MAY_SLEEP, - derive_crypt_complete, &ecr); - res = crypto_ablkcipher_setkey(tfm, deriving_key, - F2FS_AES_128_ECB_KEY_SIZE); - if (res < 0) - goto out; - - sg_init_one(&src_sg, source_key, F2FS_AES_256_XTS_KEY_SIZE); - sg_init_one(&dst_sg, derived_key, F2FS_AES_256_XTS_KEY_SIZE); - ablkcipher_request_set_crypt(req, &src_sg, &dst_sg, - F2FS_AES_256_XTS_KEY_SIZE, NULL); - res = crypto_ablkcipher_encrypt(req); - if (res == -EINPROGRESS || res == -EBUSY) { - BUG_ON(req->base.data != &ecr); - wait_for_completion(&ecr.completion); - res = ecr.res; - } -out: - if (req) - ablkcipher_request_free(req); - if (tfm) - crypto_free_ablkcipher(tfm); - return res; -} - -static void f2fs_free_crypt_info(struct f2fs_crypt_info *ci) -{ - if (!ci) - return; - - crypto_free_ablkcipher(ci->ci_ctfm); - kmem_cache_free(f2fs_crypt_info_cachep, ci); -} - -void f2fs_free_encryption_info(struct inode *inode, struct f2fs_crypt_info *ci) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *prev; - - if (ci == NULL) - ci = ACCESS_ONCE(fi->i_crypt_info); - if (ci == NULL) - return; - prev = cmpxchg(&fi->i_crypt_info, ci, NULL); - if (prev != ci) - return; - - f2fs_free_crypt_info(ci); -} - -int f2fs_get_encryption_info(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - struct f2fs_crypt_info *crypt_info; - char full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (F2FS_KEY_DESCRIPTOR_SIZE * 2) + 1]; - struct key *keyring_key = NULL; - struct f2fs_encryption_key *master_key; - struct f2fs_encryption_context ctx; - const struct user_key_payload *ukp; - struct crypto_ablkcipher *ctfm; - const char *cipher_str; - char raw_key[F2FS_MAX_KEY_SIZE]; - char mode; - int res; - - if (fi->i_crypt_info) - return 0; - - res = f2fs_crypto_initialize(); - if (res) - return res; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res < 0) - return res; - else if (res != sizeof(ctx)) - return -EINVAL; - res = 0; - - crypt_info = kmem_cache_alloc(f2fs_crypt_info_cachep, GFP_NOFS); - if (!crypt_info) - return -ENOMEM; - - crypt_info->ci_flags = ctx.flags; - crypt_info->ci_data_mode = ctx.contents_encryption_mode; - crypt_info->ci_filename_mode = ctx.filenames_encryption_mode; - crypt_info->ci_ctfm = NULL; - memcpy(crypt_info->ci_master_key, ctx.master_key_descriptor, - sizeof(crypt_info->ci_master_key)); - if (S_ISREG(inode->i_mode)) - mode = crypt_info->ci_data_mode; - else if (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) - mode = crypt_info->ci_filename_mode; - else - BUG(); - - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - cipher_str = "xts(aes)"; - break; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - cipher_str = "cts(cbc(aes))"; - break; - default: - printk_once(KERN_WARNING - "f2fs: unsupported key mode %d (ino %u)\n", - mode, (unsigned) inode->i_ino); - res = -ENOKEY; - goto out; - } - - memcpy(full_key_descriptor, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sprintf(full_key_descriptor + F2FS_KEY_DESC_PREFIX_SIZE, - "%*phN", F2FS_KEY_DESCRIPTOR_SIZE, - ctx.master_key_descriptor); - full_key_descriptor[F2FS_KEY_DESC_PREFIX_SIZE + - (2 * F2FS_KEY_DESCRIPTOR_SIZE)] = '\0'; - keyring_key = request_key(&key_type_logon, full_key_descriptor, NULL); - if (IS_ERR(keyring_key)) { - res = PTR_ERR(keyring_key); - keyring_key = NULL; - goto out; - } - BUG_ON(keyring_key->type != &key_type_logon); - ukp = user_key_payload(keyring_key); - if (ukp->datalen != sizeof(struct f2fs_encryption_key)) { - res = -EINVAL; - goto out; - } - master_key = (struct f2fs_encryption_key *)ukp->data; - BUILD_BUG_ON(F2FS_AES_128_ECB_KEY_SIZE != - F2FS_KEY_DERIVATION_NONCE_SIZE); - BUG_ON(master_key->size != F2FS_AES_256_XTS_KEY_SIZE); - res = f2fs_derive_key_aes(ctx.nonce, master_key->raw, - raw_key); - if (res) - goto out; - - ctfm = crypto_alloc_ablkcipher(cipher_str, 0, 0); - if (!ctfm || IS_ERR(ctfm)) { - res = ctfm ? PTR_ERR(ctfm) : -ENOMEM; - printk(KERN_DEBUG - "%s: error %d (inode %u) allocating crypto tfm\n", - __func__, res, (unsigned) inode->i_ino); - goto out; - } - crypt_info->ci_ctfm = ctfm; - crypto_ablkcipher_clear_flags(ctfm, ~0); - crypto_tfm_set_flags(crypto_ablkcipher_tfm(ctfm), - CRYPTO_TFM_REQ_WEAK_KEY); - res = crypto_ablkcipher_setkey(ctfm, raw_key, - f2fs_encryption_key_size(mode)); - if (res) - goto out; - - if (cmpxchg(&fi->i_crypt_info, NULL, crypt_info) == NULL) - crypt_info = NULL; -out: - if (res == -ENOKEY && !S_ISREG(inode->i_mode)) - res = 0; - key_put(keyring_key); - f2fs_free_crypt_info(crypt_info); - memzero_explicit(raw_key, sizeof(raw_key)); - return res; -} - -int f2fs_has_encryption_key(struct inode *inode) -{ - struct f2fs_inode_info *fi = F2FS_I(inode); - - return (fi->i_crypt_info != NULL); -} diff --git a/fs/f2fs/crypto_policy.c b/fs/f2fs/crypto_policy.c deleted file mode 100644 index 884f3f0fe29d..000000000000 --- a/fs/f2fs/crypto_policy.c +++ /dev/null @@ -1,248 +0,0 @@ -/* - * copied from linux/fs/ext4/crypto_policy.c - * - * Copyright (C) 2015, Google, Inc. - * Copyright (C) 2015, Motorola Mobility. - * - * This contains encryption policy functions for f2fs with some modifications - * to support f2fs-specific xattr APIs. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#include -#include -#include -#include - -#include "f2fs.h" -#include "xattr.h" - -static int f2fs_inode_has_encryption_context(struct inode *inode) -{ - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, NULL, 0, NULL); - return (res > 0); -} - -/* - * check whether the policy is consistent with the encryption context - * for the inode - */ -static int f2fs_is_encryption_context_consistent_with_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL); - - if (res != sizeof(ctx)) - return 0; - - return (memcmp(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (ctx.flags == policy->flags) && - (ctx.contents_encryption_mode == - policy->contents_encryption_mode) && - (ctx.filenames_encryption_mode == - policy->filenames_encryption_mode)); -} - -static int f2fs_create_encryption_context_from_policy( - struct inode *inode, const struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - memcpy(ctx.master_key_descriptor, policy->master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - - if (!f2fs_valid_contents_enc_mode(policy->contents_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid contents encryption mode %d\n", __func__, - policy->contents_encryption_mode); - return -EINVAL; - } - - if (!f2fs_valid_filenames_enc_mode(policy->filenames_encryption_mode)) { - printk(KERN_WARNING - "%s: Invalid filenames encryption mode %d\n", __func__, - policy->filenames_encryption_mode); - return -EINVAL; - } - - if (policy->flags & ~F2FS_POLICY_FLAGS_VALID) - return -EINVAL; - - ctx.contents_encryption_mode = policy->contents_encryption_mode; - ctx.filenames_encryption_mode = policy->filenames_encryption_mode; - ctx.flags = policy->flags; - BUILD_BUG_ON(sizeof(ctx.nonce) != F2FS_KEY_DERIVATION_NONCE_SIZE); - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - - return f2fs_setxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), NULL, XATTR_CREATE); -} - -int f2fs_process_policy(const struct f2fs_encryption_policy *policy, - struct inode *inode) -{ - if (!inode_owner_or_capable(inode)) - return -EACCES; - - if (policy->version != 0) - return -EINVAL; - - if (!S_ISDIR(inode->i_mode)) - return -EINVAL; - - if (!f2fs_inode_has_encryption_context(inode)) { - if (!f2fs_empty_dir(inode)) - return -ENOTEMPTY; - return f2fs_create_encryption_context_from_policy(inode, - policy); - } - - if (f2fs_is_encryption_context_consistent_with_policy(inode, policy)) - return 0; - - printk(KERN_WARNING "%s: Policy inconsistent with encryption context\n", - __func__); - return -EINVAL; -} - -int f2fs_get_policy(struct inode *inode, struct f2fs_encryption_policy *policy) -{ - struct f2fs_encryption_context ctx; - int res; - - if (!f2fs_encrypted_inode(inode)) - return -ENODATA; - - res = f2fs_getxattr(inode, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &ctx, sizeof(ctx), NULL); - if (res != sizeof(ctx)) - return -ENODATA; - if (ctx.format != F2FS_ENCRYPTION_CONTEXT_FORMAT_V1) - return -EINVAL; - - policy->version = 0; - policy->contents_encryption_mode = ctx.contents_encryption_mode; - policy->filenames_encryption_mode = ctx.filenames_encryption_mode; - policy->flags = ctx.flags; - memcpy(&policy->master_key_descriptor, ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE); - return 0; -} - -int f2fs_is_child_context_consistent_with_parent(struct inode *parent, - struct inode *child) -{ - const struct f2fs_crypt_info *parent_ci, *child_ci; - struct f2fs_encryption_context parent_ctx, child_ctx; - int res; - - /* No restrictions on file types which are never encrypted */ - if (!S_ISREG(child->i_mode) && !S_ISDIR(child->i_mode) && - !S_ISLNK(child->i_mode)) - return 1; - - /* No restrictions if the parent directory is unencrypted */ - if (!f2fs_encrypted_inode(parent)) - return 1; - - /* Encrypted directories must not contain unencrypted files */ - if (!f2fs_encrypted_inode(child)) - return 0; - - /* - * Both parent and child are encrypted, so verify they use the same - * encryption policy. Compare the fscrypt_info structs if the keys are - * available, otherwise retrieve and compare the fscrypt_contexts. - * - * Note that the fscrypt_context retrieval will be required frequently - * when accessing an encrypted directory tree without the key. - * Performance-wise this is not a big deal because we already don't - * really optimize for file access without the key (to the extent that - * such access is even possible), given that any attempted access - * already causes a fscrypt_context retrieval and keyring search. - * - * In any case, if an unexpected error occurs, fall back to "forbidden". - */ - - res = f2fs_get_encryption_info(parent); - if (res) - return 0; - res = f2fs_get_encryption_info(child); - if (res) - return 0; - parent_ci = F2FS_I(parent)->i_crypt_info; - child_ci = F2FS_I(child)->i_crypt_info; - if (parent_ci && child_ci) { - return memcmp(parent_ci->ci_master_key, child_ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ci->ci_data_mode == child_ci->ci_data_mode) && - (parent_ci->ci_filename_mode == - child_ci->ci_filename_mode) && - (parent_ci->ci_flags == child_ci->ci_flags); - } - - res = f2fs_getxattr(parent, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &parent_ctx, sizeof(parent_ctx), NULL); - if (res != sizeof(parent_ctx)) - return 0; - - res = f2fs_getxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, - &child_ctx, sizeof(child_ctx), NULL); - if (res != sizeof(child_ctx)) - return 0; - - return memcmp(parent_ctx.master_key_descriptor, - child_ctx.master_key_descriptor, - F2FS_KEY_DESCRIPTOR_SIZE) == 0 && - (parent_ctx.contents_encryption_mode == - child_ctx.contents_encryption_mode) && - (parent_ctx.filenames_encryption_mode == - child_ctx.filenames_encryption_mode) && - (parent_ctx.flags == child_ctx.flags); -} - -/** - * f2fs_inherit_context() - Sets a child context from its parent - * @parent: Parent inode from which the context is inherited. - * @child: Child inode that inherits the context from @parent. - * - * Return: Zero on success, non-zero otherwise - */ -int f2fs_inherit_context(struct inode *parent, struct inode *child, - struct page *ipage) -{ - struct f2fs_encryption_context ctx; - struct f2fs_crypt_info *ci; - int res; - - res = f2fs_get_encryption_info(parent); - if (res < 0) - return res; - - ci = F2FS_I(parent)->i_crypt_info; - BUG_ON(ci == NULL); - - ctx.format = F2FS_ENCRYPTION_CONTEXT_FORMAT_V1; - - ctx.contents_encryption_mode = ci->ci_data_mode; - ctx.filenames_encryption_mode = ci->ci_filename_mode; - ctx.flags = ci->ci_flags; - memcpy(ctx.master_key_descriptor, ci->ci_master_key, - F2FS_KEY_DESCRIPTOR_SIZE); - - get_random_bytes(ctx.nonce, F2FS_KEY_DERIVATION_NONCE_SIZE); - return f2fs_setxattr(child, F2FS_XATTR_INDEX_ENCRYPTION, - F2FS_XATTR_NAME_ENCRYPTION_CONTEXT, &ctx, - sizeof(ctx), ipage, XATTR_CREATE); -} diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4ac72a3f920a..c8583d7a1845 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -56,8 +56,10 @@ static void f2fs_read_end_io(struct bio *bio) int i; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) + if (time_to_inject(F2FS_P_SB(bio->bi_io_vec->bv_page), FAULT_IO)) { + f2fs_show_injection_info(FAULT_IO); bio->bi_error = -EIO; + } #endif if (f2fs_bio_encrypted(bio)) { @@ -223,7 +225,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi, trace_f2fs_submit_read_bio(sbi->sb, type, bio); else trace_f2fs_submit_write_bio(sbi->sb, type, bio); - submit_bio(0, bio); + submit_bio(bio_op(bio), bio); } static void __submit_merged_bio(struct f2fs_bio_info *io) @@ -244,8 +246,8 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) io->bio = NULL; } -static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, - struct page *page, nid_t ino) +static bool __has_merged_page(struct f2fs_bio_info *io, + struct inode *inode, nid_t ino, pgoff_t idx) { struct bio_vec *bvec; struct page *target; @@ -254,7 +256,7 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, if (!io->bio) return false; - if (!inode && !page && !ino) + if (!inode && !ino) return true; bio_for_each_segment_all(bvec, io->bio, i) { @@ -264,10 +266,11 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, else target = fscrypt_control_page(bvec->bv_page); + if (idx != target->index) + continue; + if (inode && inode == target->mapping->host) return true; - if (page && page == target) - return true; if (ino && ino == ino_of_node(target)) return true; } @@ -276,70 +279,88 @@ static bool __has_merged_page(struct f2fs_bio_info *io, struct inode *inode, } static bool has_merged_page(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page, nid_t ino, - enum page_type type) + nid_t ino, pgoff_t idx, enum page_type type) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io = &sbi->write_io[btype]; - bool ret; + enum temp_type temp; + struct f2fs_bio_info *io; + bool ret = false; - down_read(&io->io_rwsem); - ret = __has_merged_page(io, inode, page, ino); - up_read(&io->io_rwsem); + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + io = sbi->write_io[btype] + temp; + + down_read(&io->io_rwsem); + ret = __has_merged_page(io, inode, ino, idx); + up_read(&io->io_rwsem); + + /* TODO: use HOT temp only for meta pages now. */ + if (ret || btype == META) + break; + } return ret; } -static void __f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi, + enum page_type type, enum temp_type temp) { enum page_type btype = PAGE_TYPE_OF_BIO(type); - struct f2fs_bio_info *io; - - io = is_read_io(rw) ? &sbi->read_io : &sbi->write_io[btype]; + struct f2fs_bio_info *io = sbi->write_io[btype] + temp; down_write(&io->io_rwsem); - if (!__has_merged_page(io, inode, page, ino)) - goto out; - /* change META to META_FLUSH in the checkpoint procedure */ if (type >= META_FLUSH) { io->fio.type = META_FLUSH; io->fio.op = REQ_OP_WRITE; - io->fio.op_flags = WRITE_FLUSH | REQ_META | REQ_PRIO; + io->fio.op_flags = REQ_META | REQ_PRIO; if (!test_opt(sbi, NOBARRIER)) - io->fio.op_flags |= REQ_FUA; + io->fio.op_flags |= WRITE_FLUSH | REQ_FUA; } __submit_merged_bio(io); -out: up_write(&io->io_rwsem); } -void f2fs_submit_merged_bio(struct f2fs_sb_info *sbi, enum page_type type, - int rw) +static void __submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type, bool force) { - __f2fs_submit_merged_bio(sbi, NULL, NULL, 0, type, rw); + enum temp_type temp; + + if (!force && !has_merged_page(sbi, inode, ino, idx, type)) + return; + + for (temp = HOT; temp < NR_TEMP_TYPE; temp++) { + + __f2fs_submit_merged_write(sbi, type, temp); + + /* TODO: use HOT temp only for meta pages now. */ + if (type >= META) + break; + } } -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *sbi, - struct inode *inode, struct page *page, - nid_t ino, enum page_type type, int rw) +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type) { - if (has_merged_page(sbi, inode, page, ino, type)) - __f2fs_submit_merged_bio(sbi, inode, page, ino, type, rw); + __submit_merged_write_cond(sbi, NULL, 0, 0, type, true); } -void f2fs_flush_merged_bios(struct f2fs_sb_info *sbi) +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type) { - f2fs_submit_merged_bio(sbi, DATA, WRITE); - f2fs_submit_merged_bio(sbi, NODE, WRITE); - f2fs_submit_merged_bio(sbi, META, WRITE); + __submit_merged_write_cond(sbi, inode, ino, idx, type, false); +} + +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi) +{ + f2fs_submit_merged_write(sbi, DATA); + f2fs_submit_merged_write(sbi, NODE); + f2fs_submit_merged_write(sbi, META); } /* * Fill the locked page with data located in the block address. - * Return unlocked page. + * A caller needs to unlock the page on failure. */ int f2fs_submit_page_bio(struct f2fs_io_info *fio) { @@ -360,19 +381,35 @@ int f2fs_submit_page_bio(struct f2fs_io_info *fio) bio_set_op_attrs(bio, fio->op, fio->op_flags); __submit_bio(fio->sbi, bio, fio->type); + + if (!is_read_io(fio->op)) + inc_page_count(fio->sbi, WB_DATA_TYPE(fio->page)); return 0; } -int f2fs_submit_page_mbio(struct f2fs_io_info *fio) +int f2fs_submit_page_write(struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = fio->sbi; enum page_type btype = PAGE_TYPE_OF_BIO(fio->type); - struct f2fs_bio_info *io; - bool is_read = is_read_io(fio->op); + struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp; struct page *bio_page; int err = 0; - io = is_read ? &sbi->read_io : &sbi->write_io[btype]; + f2fs_bug_on(sbi, is_read_io(fio->op)); + + down_write(&io->io_rwsem); +next: + if (fio->in_list) { + spin_lock(&io->io_lock); + if (list_empty(&io->io_list)) { + spin_unlock(&io->io_lock); + goto out_fail; + } + fio = list_first_entry(&io->io_list, + struct f2fs_io_info, list); + list_del(&fio->list); + spin_unlock(&io->io_lock); + } if (fio->old_blkaddr != NEW_ADDR) verify_block_addr(sbi, fio->old_blkaddr); @@ -380,10 +417,10 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page; - if (!is_read) - inc_page_count(sbi, WB_DATA_TYPE(bio_page)); + /* set submitted = 1 as a return value */ + fio->submitted = 1; - down_write(&io->io_rwsem); + inc_page_count(sbi, WB_DATA_TYPE(bio_page)); if (io->bio && (io->last_block_in_bio != fio->new_blkaddr - 1 || (io->fio.op != fio->op || io->fio.op_flags != fio->op_flags) || @@ -398,32 +435,86 @@ int f2fs_submit_page_mbio(struct f2fs_io_info *fio) goto out_fail; } io->bio = __bio_alloc(sbi, fio->new_blkaddr, - BIO_MAX_PAGES, is_read); + BIO_MAX_PAGES, false); io->fio = *fio; } - if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) { __submit_merged_bio(io); goto alloc_new; } io->last_block_in_bio = fio->new_blkaddr; f2fs_trace_ios(fio, 0); + + trace_f2fs_submit_page_write(fio->page, fio); + + if (fio->in_list) + goto next; out_fail: up_write(&io->io_rwsem); - trace_f2fs_submit_page_mbio(fio->page, fio); return err; } +static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, + unsigned nr_pages) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct fscrypt_ctx *ctx = NULL; + struct bio *bio; + + if (f2fs_encrypted_file(inode)) { + ctx = fscrypt_get_ctx(inode, GFP_NOFS); + if (IS_ERR(ctx)) + return ERR_CAST(ctx); + + /* wait the page to be moved by cleaning */ + f2fs_wait_on_block_writeback(sbi, blkaddr); + } + + bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); + if (!bio) { + if (ctx) + fscrypt_release_ctx(ctx); + return ERR_PTR(-ENOMEM); + } + f2fs_target_device(sbi, blkaddr, bio); + bio->bi_end_io = f2fs_read_end_io; + bio->bi_private = ctx; + bio_set_op_attrs(bio, REQ_OP_READ, 0); + + return bio; +} + +/* This can handle encryption stuffs */ +static int f2fs_submit_page_read(struct inode *inode, struct page *page, + block_t blkaddr) +{ + struct bio *bio = f2fs_grab_read_bio(inode, blkaddr, 1); + + if (IS_ERR(bio)) + return PTR_ERR(bio); + + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { + bio_put(bio); + return -EFAULT; + } + __submit_bio(F2FS_I_SB(inode), bio, DATA); + return 0; +} + static void __set_data_blkaddr(struct dnode_of_data *dn) { struct f2fs_node *rn = F2FS_NODE(dn->node_page); __le32 *addr_array; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); /* Get physical address of data block */ addr_array = blkaddr_in_node(rn); - addr_array[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); + addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); } /* @@ -451,14 +542,15 @@ void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); + int err; if (!count) return 0; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); @@ -466,8 +558,8 @@ int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) f2fs_wait_on_page_writeback(dn->node_page, NODE, true); for (; count > 0; dn->ofs_in_node++) { - block_t blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + block_t blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -509,7 +601,7 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index) { - struct extent_info ei; + struct extent_info ei = {0,0,0}; struct inode *inode = dn->inode; if (f2fs_lookup_extent_cache(inode, index, &ei)) { @@ -526,18 +618,8 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; struct page *page; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err; - struct f2fs_io_info fio = { - .sbi = F2FS_I_SB(inode), - .type = DATA, - .op = REQ_OP_READ, - .op_flags = op_flags, - .encrypted_page = NULL, - }; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - return read_mapping_page(mapping, index, NULL); page = f2fs_grab_cache_page(mapping, index, for_write); if (!page) @@ -578,9 +660,7 @@ struct page *get_read_data_page(struct inode *inode, pgoff_t index, return page; } - fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; - fio.page = page; - err = f2fs_submit_page_bio(&fio); + err = f2fs_submit_page_read(inode, page, dn.data_blkaddr); if (err) goto put_err; return page; @@ -709,23 +789,25 @@ static int __allocate_data_block(struct dnode_of_data *dn) struct node_info ni; pgoff_t fofs; blkcnt_t count = 1; + int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return -EPERM; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); if (dn->data_blkaddr == NEW_ADDR) goto alloc; - if (unlikely(!inc_valid_block_count(sbi, dn->inode, &count))) - return -ENOSPC; + if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count)))) + return err; alloc: get_node_info(sbi, dn->nid, &ni); set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); allocate_data_block(sbi, NULL, dn->data_blkaddr, &dn->data_blkaddr, - &sum, CURSEG_WARM_DATA); + &sum, CURSEG_WARM_DATA, NULL, false); set_data_blkaddr(dn); /* update i_size */ @@ -739,7 +821,7 @@ static int __allocate_data_block(struct dnode_of_data *dn) static inline bool __force_buffered_io(struct inode *inode, int rw) { - return ((f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) || + return (f2fs_encrypted_file(inode) || (rw == WRITE && test_opt(F2FS_I_SB(inode), LFS)) || F2FS_I_SB(inode)->s_ndevs); } @@ -750,6 +832,9 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) struct f2fs_map_blocks map; int err = 0; + if (is_inode_flag_set(inode, FI_NO_PREALLOC)) + return 0; + map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos); map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from)); if (map.m_len > map.m_lblk) @@ -768,7 +853,7 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) F2FS_GET_BLOCK_PRE_AIO : F2FS_GET_BLOCK_PRE_DIO); } - if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA) { + if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) { err = f2fs_convert_inline_inode(inode); if (err) return err; @@ -778,6 +863,21 @@ int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from) return err; } +static inline void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock) +{ + if (flag == F2FS_GET_BLOCK_PRE_AIO) { + if (lock) + down_read(&sbi->node_change); + else + up_read(&sbi->node_change); + } else { + if (lock) + f2fs_lock_op(sbi); + else + f2fs_unlock_op(sbi); + } +} + /* * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with * f2fs_map_blocks structure. @@ -798,7 +898,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int err = 0, ofs = 1; unsigned int ofs_in_node, last_ofs_in_node; blkcnt_t prealloc; - struct extent_info ei; + struct extent_info ei = {0,0,0}; block_t blkaddr; if (!maxblocks) @@ -820,7 +920,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, next_dnode: if (create) - f2fs_lock_op(sbi); + __do_map_lock(sbi, flag, true); /* When reading holes, we need its node page */ set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -842,7 +942,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, end_offset = ADDRS_PER_PAGE(dn.node_page, inode); next_block: - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); if (blkaddr == NEW_ADDR || blkaddr == NULL_ADDR) { if (create) { @@ -862,7 +962,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, } if (err) goto sync_out; - map->m_flags = F2FS_MAP_NEW; + map->m_flags |= F2FS_MAP_NEW; blkaddr = dn.data_blkaddr; } else { if (flag == F2FS_GET_BLOCK_BMAP) { @@ -930,7 +1030,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } goto next_dnode; @@ -939,7 +1039,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, f2fs_put_dnode(&dn); unlock_out: if (create) { - f2fs_unlock_op(sbi); + __do_map_lock(sbi, flag, false); f2fs_balance_fs(sbi, dn.node_changed); } out: @@ -962,7 +1062,7 @@ static int __get_data_block(struct inode *inode, sector_t iblock, if (!err) { map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags; - bh->b_size = map.m_len << inode->i_blkbits; + bh->b_size = (u64)map.m_len << inode->i_blkbits; } return err; } @@ -979,7 +1079,7 @@ static int get_data_block_dio(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { return __get_data_block(inode, iblock, bh_result, create, - F2FS_GET_BLOCK_DIO, NULL); + F2FS_GET_BLOCK_DEFAULT, NULL); } static int get_data_block_bmap(struct inode *inode, sector_t iblock, @@ -1085,35 +1185,6 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } -static struct bio *f2fs_grab_bio(struct inode *inode, block_t blkaddr, - unsigned nr_pages) -{ - struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct fscrypt_ctx *ctx = NULL; - struct bio *bio; - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - ctx = fscrypt_get_ctx(inode, GFP_NOFS); - if (IS_ERR(ctx)) - return ERR_CAST(ctx); - - /* wait the page to be moved by cleaning */ - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); - } - - bio = bio_alloc(GFP_KERNEL, min_t(int, nr_pages, BIO_MAX_PAGES)); - if (!bio) { - if (ctx) - fscrypt_release_ctx(ctx); - return ERR_PTR(-ENOMEM); - } - f2fs_target_device(sbi, blkaddr, bio); - bio->bi_end_io = f2fs_read_end_io; - bio->bi_private = ctx; - - return bio; -} - /* * This function was originally taken from fs/mpage.c, and customized for f2fs. * Major change was from block_size == page_size in f2fs by default. @@ -1142,9 +1213,10 @@ static int f2fs_mpage_readpages(struct address_space *mapping, for (page_idx = 0; nr_pages; page_idx++, nr_pages--) { - prefetchw(&page->flags); if (pages) { page = list_last_entry(pages, struct page, lru); + + prefetchw(&page->flags); list_del(&page->lru); if (add_to_page_cache_lru(page, mapping, page->index, GFP_KERNEL)) @@ -1177,7 +1249,7 @@ static int f2fs_mpage_readpages(struct address_space *mapping, map.m_len = last_block - block_in_file; if (f2fs_map_blocks(inode, &map, 0, - F2FS_GET_BLOCK_READ)) + F2FS_GET_BLOCK_DEFAULT)) goto set_error_page; } got_it: @@ -1208,12 +1280,11 @@ static int f2fs_mpage_readpages(struct address_space *mapping, bio = NULL; } if (bio == NULL) { - bio = f2fs_grab_bio(inode, block_nr, nr_pages); + bio = f2fs_grab_read_bio(inode, block_nr, nr_pages); if (IS_ERR(bio)) { bio = NULL; goto set_error_page; } - bio_set_op_attrs(bio, REQ_OP_READ, 0); } if (bio_add_page(bio, page, blocksize, 0) < blocksize) @@ -1273,17 +1344,84 @@ static int f2fs_read_data_pages(struct file *file, return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages); } +static int encrypt_one_page(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + gfp_t gfp_flags = GFP_NOFS; + + if (!f2fs_encrypted_file(inode)) + return 0; + + /* wait for GCed encrypted page writeback */ + f2fs_wait_on_block_writeback(fio->sbi, fio->old_blkaddr); + +retry_encrypt: + fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, + PAGE_SIZE, 0, fio->page->index, gfp_flags); + if (!IS_ERR(fio->encrypted_page)) + return 0; + + /* flush pending IOs and wait for a while in the ENOMEM case */ + if (PTR_ERR(fio->encrypted_page) == -ENOMEM) { + f2fs_flush_merged_writes(fio->sbi); + congestion_wait(BLK_RW_ASYNC, HZ/50); + gfp_flags |= __GFP_NOFAIL; + goto retry_encrypt; + } + return PTR_ERR(fio->encrypted_page); +} + +static inline bool need_inplace_update(struct f2fs_io_info *fio) +{ + struct inode *inode = fio->page->mapping->host; + + if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) + return false; + if (is_cold_data(fio->page)) + return false; + if (IS_ATOMIC_WRITTEN_PAGE(fio->page)) + return false; + + return need_inplace_update_policy(inode, fio); +} + +static inline bool valid_ipu_blkaddr(struct f2fs_io_info *fio) +{ + if (fio->old_blkaddr == NEW_ADDR) + return false; + if (fio->old_blkaddr == NULL_ADDR) + return false; + return true; +} + int do_write_data_page(struct f2fs_io_info *fio) { struct page *page = fio->page; struct inode *inode = page->mapping->host; struct dnode_of_data dn; + struct extent_info ei = {0,0,0}; + bool ipu_force = false; int err = 0; set_new_dnode(&dn, inode, NULL, NULL, 0); + if (need_inplace_update(fio) && + f2fs_lookup_extent_cache(inode, page->index, &ei)) { + fio->old_blkaddr = ei.blk + page->index - ei.fofs; + + if (valid_ipu_blkaddr(fio)) { + ipu_force = true; + fio->need_lock = LOCK_DONE; + goto got_it; + } + } + + /* Deadlock due to between page->lock and f2fs_lock_op */ + if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi)) + return -EAGAIN; + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); if (err) - return err; + goto out; fio->old_blkaddr = dn.data_blkaddr; @@ -1292,57 +1430,57 @@ int do_write_data_page(struct f2fs_io_info *fio) ClearPageUptodate(page); goto out_writepage; } - - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) { - gfp_t gfp_flags = GFP_NOFS; - - /* wait for GCed encrypted page writeback */ - f2fs_wait_on_encrypted_page_writeback(F2FS_I_SB(inode), - fio->old_blkaddr); -retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_page(inode, fio->page, - gfp_flags); - if (IS_ERR(fio->encrypted_page)) { - err = PTR_ERR(fio->encrypted_page); - if (err == -ENOMEM) { - /* flush pending ios and wait for a while */ - f2fs_flush_merged_bios(F2FS_I_SB(inode)); - congestion_wait(BLK_RW_ASYNC, HZ/50); - gfp_flags |= __GFP_NOFAIL; - err = 0; - goto retry_encrypt; - } - goto out_writepage; - } - } - - set_page_writeback(page); - +got_it: /* * If current allocation needs SSR, * it had better in-place writes for updated data. */ - if (unlikely(fio->old_blkaddr != NEW_ADDR && - !is_cold_data(page) && - !IS_ATOMIC_WRITTEN_PAGE(page) && - need_inplace_update(inode))) { - rewrite_data_page(fio); + if (ipu_force || (valid_ipu_blkaddr(fio) && need_inplace_update(fio))) { + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + f2fs_put_dnode(&dn); + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); + err = rewrite_data_page(fio); + trace_f2fs_do_write_data_page(fio->page, IPU); set_inode_flag(inode, FI_UPDATE_WRITE); - trace_f2fs_do_write_data_page(page, IPU); - } else { - write_data_page(&dn, fio); - trace_f2fs_do_write_data_page(page, OPU); - set_inode_flag(inode, FI_APPEND_WRITE); - if (page->index == 0) - set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); + return err; } + + if (fio->need_lock == LOCK_RETRY) { + if (!f2fs_trylock_op(fio->sbi)) { + err = -EAGAIN; + goto out_writepage; + } + fio->need_lock = LOCK_REQ; + } + + err = encrypt_one_page(fio); + if (err) + goto out_writepage; + + set_page_writeback(page); + + /* LFS mode write path */ + write_data_page(&dn, fio); + trace_f2fs_do_write_data_page(page, OPU); + set_inode_flag(inode, FI_APPEND_WRITE); + if (page->index == 0) + set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN); out_writepage: f2fs_put_dnode(&dn); +out: + if (fio->need_lock == LOCK_REQ) + f2fs_unlock_op(fio->sbi); return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) +static int __write_data_page(struct page *page, bool *submitted, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = page->mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1358,12 +1496,19 @@ static int f2fs_write_data_page(struct page *page, .type = DATA, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .submitted = false, + .need_lock = LOCK_RETRY, + .io_type = io_type, }; trace_f2fs_writepage(page, DATA); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (page->index < end_index) goto write; @@ -1377,8 +1522,6 @@ static int f2fs_write_data_page(struct page *page, zero_user_segment(page, offset, PAGE_SIZE); write: - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; if (f2fs_is_drop_cache(inode)) goto out; /* we should not write 0'th page having journal header */ @@ -1395,6 +1538,7 @@ static int f2fs_write_data_page(struct page *page, /* Dentry blocks are controlled by checkpoint */ if (S_ISDIR(inode->i_mode)) { + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); goto done; } @@ -1403,16 +1547,26 @@ static int f2fs_write_data_page(struct page *page, need_balance_fs = true; else if (has_not_enough_free_secs(sbi, 0, 0)) goto redirty_out; + else + set_inode_flag(inode, FI_HOT_DATA); err = -EAGAIN; - f2fs_lock_op(sbi); - if (f2fs_has_inline_data(inode)) + if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, page); - if (err == -EAGAIN) + if (!err) + goto out; + } + + if (err == -EAGAIN) { err = do_write_data_page(&fio); + if (err == -EAGAIN) { + fio.need_lock = LOCK_REQ; + err = do_write_data_page(&fio); + } + } if (F2FS_I(inode)->last_disk_size < psize) F2FS_I(inode)->last_disk_size = psize; - f2fs_unlock_op(sbi); + done: if (err && err != -ENOENT) goto redirty_out; @@ -1423,15 +1577,23 @@ static int f2fs_write_data_page(struct page *page, ClearPageUptodate(page); if (wbc->for_reclaim) { - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, DATA, WRITE); + f2fs_submit_merged_write_cond(sbi, inode, 0, page->index, DATA); + clear_inode_flag(inode, FI_HOT_DATA); remove_dirty_inode(inode); + submitted = NULL; } unlock_page(page); - f2fs_balance_fs(sbi, need_balance_fs); + if (!S_ISDIR(inode->i_mode)) + f2fs_balance_fs(sbi, need_balance_fs); - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, DATA, WRITE); + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, DATA); + submitted = NULL; + } + + if (submitted) + *submitted = fio.submitted; return 0; @@ -1443,13 +1605,20 @@ static int f2fs_write_data_page(struct page *page, return err; } +static int f2fs_write_data_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_data_page(page, NULL, wbc, FS_DATA_IO); +} + /* * This function was copied from write_cche_pages from mm/page-writeback.c. * The major change is making write step of cold data page separately from * warm/hot data page. */ static int f2fs_write_cache_pages(struct address_space *mapping, - struct writeback_control *wbc) + struct writeback_control *wbc, + enum iostat_type io_type) { int ret = 0; int done = 0; @@ -1459,13 +1628,19 @@ static int f2fs_write_cache_pages(struct address_space *mapping, pgoff_t index; pgoff_t end; /* Inclusive */ pgoff_t done_index; + pgoff_t last_idx = ULONG_MAX; int cycled; int range_whole = 0; int tag; - int nwritten = 0; pagevec_init(&pvec, 0); + if (get_dirty_pages(mapping->host) <= + SM_I(F2FS_M_SB(mapping))->min_hot_blocks) + set_inode_flag(mapping->host, FI_HOT_DATA); + else + clear_inode_flag(mapping->host, FI_HOT_DATA); + if (wbc->range_cyclic) { writeback_index = mapping->writeback_index; /* prev offset */ index = writeback_index; @@ -1499,6 +1674,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (page->index > end) { done = 1; @@ -1506,7 +1682,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, } done_index = page->index; - +retry_write: lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1532,7 +1708,7 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = mapping->a_ops->writepage(page, wbc); + ret = __write_data_page(page, &submitted, wbc, io_type); if (unlikely(ret)) { /* * keep nr_to_write, since vfs uses this to @@ -1542,16 +1718,27 @@ static int f2fs_write_cache_pages(struct address_space *mapping, unlock_page(page); ret = 0; continue; + } else if (ret == -EAGAIN) { + ret = 0; + if (wbc->sync_mode == WB_SYNC_ALL) { + cond_resched(); + congestion_wait(BLK_RW_ASYNC, + HZ/50); + goto retry_write; + } + continue; } done_index = page->index + 1; done = 1; break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } - if (--wbc->nr_to_write <= 0 && - wbc->sync_mode == WB_SYNC_NONE) { + /* give a priority to WB_SYNC threads */ + if ((atomic_read(&F2FS_M_SB(mapping)->wb_sync_req) || + --wbc->nr_to_write <= 0) && + wbc->sync_mode == WB_SYNC_NONE) { done = 1; break; } @@ -1569,15 +1756,16 @@ static int f2fs_write_cache_pages(struct address_space *mapping, if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) mapping->writeback_index = done_index; - if (nwritten) - f2fs_submit_merged_bio_cond(F2FS_M_SB(mapping), mapping->host, - NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host, + 0, last_idx, DATA); return ret; } -static int f2fs_write_data_pages(struct address_space *mapping, - struct writeback_control *wbc) +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type) { struct inode *inode = mapping->host; struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -1592,6 +1780,10 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; + /* during POR, we don't need to trigger writepage at all. */ + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + if (S_ISDIR(inode->i_mode) && wbc->sync_mode == WB_SYNC_NONE && get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) && available_free_memory(sbi, DIRTY_DENTS)) @@ -1601,15 +1793,20 @@ static int f2fs_write_data_pages(struct address_space *mapping, if (is_inode_flag_set(inode, FI_DO_DEFRAG)) goto skip_write; - /* during POR, we don't need to trigger writepage at all. */ - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto skip_write; - trace_f2fs_writepages(mapping->host, wbc, DATA); + /* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */ + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_inc(&sbi->wb_sync_req); + else if (atomic_read(&sbi->wb_sync_req)) + goto skip_write; + blk_start_plug(&plug); - ret = f2fs_write_cache_pages(mapping, wbc); + ret = f2fs_write_cache_pages(mapping, wbc, io_type); blk_finish_plug(&plug); + + if (wbc->sync_mode == WB_SYNC_ALL) + atomic_dec(&sbi->wb_sync_req); /* * if some pages were truncated, we cannot guarantee its mapping->host * to detect pending bios. @@ -1624,14 +1821,26 @@ static int f2fs_write_data_pages(struct address_space *mapping, return 0; } +static int f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + + return __f2fs_write_data_pages(mapping, wbc, + F2FS_I(inode)->cp_task == current ? + FS_CP_DATA_IO : FS_DATA_IO); +} + static void f2fs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; loff_t i_size = i_size_read(inode); if (to > i_size) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_pagecache(inode, i_size); truncate_blocks(inode, i_size, true); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -1644,19 +1853,20 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct dnode_of_data dn; struct page *ipage; bool locked = false; - struct extent_info ei; + struct extent_info ei = {0,0,0}; int err = 0; /* * we already allocated all the blocks, so we don't need to get * the block addresses when there is no need to fill the page. */ - if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE) + if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE && + !is_inode_flag_set(inode, FI_NO_PREALLOC)) return 0; if (f2fs_has_inline_data(inode) || (pos & PAGE_MASK) >= i_size_read(inode)) { - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, true); locked = true; } restart: @@ -1670,7 +1880,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, set_new_dnode(&dn, inode, ipage, ipage, 0); if (f2fs_has_inline_data(inode)) { - if (pos + len <= MAX_INLINE_DATA) { + if (pos + len <= MAX_INLINE_DATA(inode)) { read_inline_data(page, ipage); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) @@ -1692,7 +1902,8 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, err = get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err || dn.data_blkaddr == NULL_ADDR) { f2fs_put_dnode(&dn); - f2fs_lock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, + true); locked = true; goto restart; } @@ -1706,7 +1917,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, f2fs_put_dnode(&dn); unlock_out: if (locked) - f2fs_unlock_op(sbi); + __do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false); return err; } @@ -1745,7 +1956,11 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, goto fail; } repeat: - page = grab_cache_page_write_begin(mapping, index, flags); + /* + * Do not use grab_cache_page_write_begin() to avoid deadlock due to + * wait_for_stable_page. Will wait that below with our IO control. + */ + page = grab_cache_page(mapping, index); if (!page) { err = -ENOMEM; goto fail; @@ -1772,8 +1987,8 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, blkaddr); if (len == PAGE_SIZE || PageUptodate(page)) return 0; @@ -1787,21 +2002,9 @@ static int f2fs_write_begin(struct file *file, struct address_space *mapping, zero_user_segment(page, 0, PAGE_SIZE); SetPageUptodate(page); } else { - struct bio *bio; - - bio = f2fs_grab_bio(inode, blkaddr, 1); - if (IS_ERR(bio)) { - err = PTR_ERR(bio); + err = f2fs_submit_page_read(inode, page, blkaddr); + if (err) goto fail; - } - bio->bi_rw = READ_SYNC; - if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) { - bio_put(bio); - err = -EFAULT; - goto fail; - } - - __submit_bio(sbi, bio, DATA); lock_page(page); if (unlikely(page->mapping != mapping)) { @@ -1914,10 +2117,13 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, up_read(&F2FS_I(inode)->dio_rwsem[rw]); if (rw == WRITE) { - if (err > 0) + if (err > 0) { + f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO, + err); set_inode_flag(inode, FI_UPDATE_WRITE); - else if (err < 0) + } else if (err < 0) { f2fs_write_failed(mapping, offset + count); + } } if (trace_android_fs_dataread_start_enabled() && @@ -1955,7 +2161,7 @@ void f2fs_invalidate_page(struct page *page, unsigned int offset, /* This is atomic written page, keep Private */ if (IS_ATOMIC_WRITTEN_PAGE(page)) - return; + return drop_inmem_page(inode, page); set_page_private(page, 0); ClearPagePrivate(page); @@ -2064,8 +2270,12 @@ int f2fs_migrate_page(struct address_space *mapping, BUG_ON(PageWriteback(page)); /* migrating an atomic written page is safe with the inmem_lock hold */ - if (atomic_written && !mutex_trylock(&fi->inmem_lock)) - return -EAGAIN; + if (atomic_written) { + if (mode != MIGRATE_SYNC) + return -EBUSY; + if (!mutex_trylock(&fi->inmem_lock)) + return -EAGAIN; + } /* * A reference is expected if PagePrivate set when move mapping, diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index cd338ca24941..87f449845f5f 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -51,9 +51,26 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->inmem_pages = get_pages(sbi, F2FS_INMEM_PAGES); si->aw_cnt = atomic_read(&sbi->aw_cnt); + si->vw_cnt = atomic_read(&sbi->vw_cnt); si->max_aw_cnt = atomic_read(&sbi->max_aw_cnt); + si->max_vw_cnt = atomic_read(&sbi->max_vw_cnt); si->nr_wb_cp_data = get_pages(sbi, F2FS_WB_CP_DATA); si->nr_wb_data = get_pages(sbi, F2FS_WB_DATA); + if (SM_I(sbi) && SM_I(sbi)->fcc_info) { + si->nr_flushed = + atomic_read(&SM_I(sbi)->fcc_info->issued_flush); + si->nr_flushing = + atomic_read(&SM_I(sbi)->fcc_info->issing_flush); + } + if (SM_I(sbi) && SM_I(sbi)->dcc_info) { + si->nr_discarded = + atomic_read(&SM_I(sbi)->dcc_info->issued_discard); + si->nr_discarding = + atomic_read(&SM_I(sbi)->dcc_info->issing_discard); + si->nr_discard_cmd = + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + si->undiscard_blks = SM_I(sbi)->dcc_info->undiscard_blks; + } si->total_count = (int)sbi->user_block_count / sbi->blocks_per_seg; si->rsvd_segs = reserved_segments(sbi); si->overp_segs = overprovision_segments(sbi); @@ -64,6 +81,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->inline_xattr = atomic_read(&sbi->inline_xattr); si->inline_inode = atomic_read(&sbi->inline_inode); si->inline_dir = atomic_read(&sbi->inline_dir); + si->append = sbi->im[APPEND_INO].ino_num; + si->update = sbi->im[UPDATE_INO].ino_num; si->orphans = sbi->im[ORPHAN_INO].ino_num; si->utilization = utilization(sbi); @@ -78,6 +97,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->sits = MAIN_SEGS(sbi); si->dirty_sits = SIT_I(sbi)->dirty_sentries; si->free_nids = NM_I(sbi)->nid_cnt[FREE_NID_LIST]; + si->avail_nids = NM_I(sbi)->available_nids; si->alloc_nids = NM_I(sbi)->nid_cnt[ALLOC_NID_LIST]; si->bg_gc = sbi->bg_gc; si->util_free = (int)(free_user_blocks(sbi) >> sbi->log_blocks_per_seg) @@ -91,8 +111,8 @@ static void update_general_status(struct f2fs_sb_info *sbi) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_NODE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); si->curseg[i] = curseg->segno; - si->cursec[i] = curseg->segno / sbi->segs_per_sec; - si->curzone[i] = si->cursec[i] / sbi->secs_per_zone; + si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); + si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); } for (i = 0; i < 2; i++) { @@ -116,10 +136,10 @@ static void update_sit_info(struct f2fs_sb_info *sbi) bimodal = 0; total_vblocks = 0; - blks_per_sec = sbi->segs_per_sec * sbi->blocks_per_seg; + blks_per_sec = BLKS_PER_SEC(sbi); hblks_per_sec = blks_per_sec / 2; for (segno = 0; segno < MAIN_SEGS(sbi); segno += sbi->segs_per_sec) { - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); dist = abs(vblocks - hblks_per_sec); bimodal += dist * dist; @@ -148,7 +168,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) if (si->base_mem) goto get_cache; - si->base_mem = sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; + /* build stat */ + si->base_mem = sizeof(struct f2fs_stat_info); + + /* build superblock */ + si->base_mem += sizeof(struct f2fs_sb_info) + sbi->sb->s_blocksize; si->base_mem += 2 * sizeof(struct f2fs_inode_info); si->base_mem += sizeof(*sbi->ckpt); si->base_mem += sizeof(struct percpu_counter) * NR_COUNT_TYPE; @@ -185,6 +209,10 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build nm */ si->base_mem += sizeof(struct f2fs_nm_info); si->base_mem += __bitmap_size(sbi, NAT_BITMAP); + si->base_mem += (NM_I(sbi)->nat_bits_blocks << F2FS_BLKSIZE_BITS); + si->base_mem += NM_I(sbi)->nat_blocks * NAT_ENTRY_BITMAP_SIZE; + si->base_mem += NM_I(sbi)->nat_blocks / 8; + si->base_mem += NM_I(sbi)->nat_blocks * sizeof(unsigned short); get_cache: si->cache_mem = 0; @@ -196,6 +224,11 @@ static void update_mem_info(struct f2fs_sb_info *sbi) /* build merge flush thread */ if (SM_I(sbi)->fcc_info) si->cache_mem += sizeof(struct flush_cmd_control); + if (SM_I(sbi)->dcc_info) { + si->cache_mem += sizeof(struct discard_cmd_control); + si->cache_mem += sizeof(struct discard_cmd) * + atomic_read(&SM_I(sbi)->dcc_info->discard_cmd_cnt); + } /* free nids */ si->cache_mem += (NM_I(sbi)->nid_cnt[FREE_NID_LIST] + @@ -256,8 +289,8 @@ static int stat_show(struct seq_file *s, void *v) si->inline_inode); seq_printf(s, " - Inline_dentry Inode: %u\n", si->inline_dir); - seq_printf(s, " - Orphan Inode: %u\n", - si->orphans); + seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", + si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", si->main_area_segs, si->main_area_sections, si->main_area_zones); @@ -316,10 +349,16 @@ static int stat_show(struct seq_file *s, void *v) seq_printf(s, " - Inner Struct Count: tree: %d(%d), node: %d\n", si->ext_tree, si->zombie_tree, si->ext_node); seq_puts(s, "\nBalancing F2FS Async:\n"); - seq_printf(s, " - IO (CP: %4d, Data: %4d)\n", - si->nr_wb_cp_data, si->nr_wb_data); - seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d)\n", - si->inmem_pages, si->aw_cnt, si->max_aw_cnt); + seq_printf(s, " - IO (CP: %4d, Data: %4d, Flush: (%4d %4d), " + "Discard: (%4d %4d)) cmd: %4d undiscard:%4u\n", + si->nr_wb_cp_data, si->nr_wb_data, + si->nr_flushing, si->nr_flushed, + si->nr_discarding, si->nr_discarded, + si->nr_discard_cmd, si->undiscard_blks); + seq_printf(s, " - inmem: %4d, atomic IO: %4d (Max. %4d), " + "volatile IO: %4d (Max. %4d)\n", + si->inmem_pages, si->aw_cnt, si->max_aw_cnt, + si->vw_cnt, si->max_vw_cnt); seq_printf(s, " - nodes: %4d in %4d\n", si->ndirty_node, si->node_pages); seq_printf(s, " - dents: %4d in dirs:%4d (%4d)\n", @@ -332,8 +371,8 @@ static int stat_show(struct seq_file *s, void *v) si->ndirty_imeta); seq_printf(s, " - NATs: %9d/%9d\n - SITs: %9d/%9d\n", si->dirty_nats, si->nats, si->dirty_sits, si->sits); - seq_printf(s, " - free_nids: %9d, alloc_nids: %9d\n", - si->free_nids, si->alloc_nids); + seq_printf(s, " - free_nids: %9d/%9d\n - alloc_nids: %9d\n", + si->free_nids, si->avail_nids, si->alloc_nids); seq_puts(s, "\nDistribution of User Blocks:"); seq_puts(s, " [ valid | invalid | free ]\n"); seq_puts(s, " ["); @@ -419,7 +458,9 @@ int f2fs_build_stats(struct f2fs_sb_info *sbi) atomic_set(&sbi->inplace_count, 0); atomic_set(&sbi->aw_cnt, 0); + atomic_set(&sbi->vw_cnt, 0); atomic_set(&sbi->max_aw_cnt, 0); + atomic_set(&sbi->max_vw_cnt, 0); mutex_lock(&f2fs_stat_mutex); list_add_tail(&si->stat_list, &f2fs_stat_list); diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 4436079dbf0c..4f2a8fedb313 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -94,7 +94,7 @@ static struct f2fs_dir_entry *find_in_block(struct page *dentry_page, dentry_blk = (struct f2fs_dentry_block *)kmap(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); de = find_target_dentry(fname, namehash, max_slots, &d); if (de) *res_page = dentry_page; @@ -111,8 +111,6 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, struct f2fs_dir_entry *de; unsigned long bit_pos = 0; int max_len = 0; - struct fscrypt_str de_name = FSTR_INIT(NULL, 0); - struct fscrypt_str *name = &fname->disk_name; if (max_slots) *max_slots = 0; @@ -130,17 +128,9 @@ struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, continue; } - /* encrypted case */ - de_name.name = d->filename[bit_pos]; - de_name.len = le16_to_cpu(de->name_len); - - /* show encrypted name */ - if (fname->hash) { - if (de->hash_code == cpu_to_le32(fname->hash)) - goto found; - } else if (de_name.len == name->len && - de->hash_code == namehash && - !memcmp(de_name.name, name->name, name->len)) + if (de->hash_code == namehash && + fscrypt_match_name(fname, d->filename[bit_pos], + le16_to_cpu(de->name_len))) goto found; if (max_slots && max_len > *max_slots) @@ -170,12 +160,7 @@ static struct f2fs_dir_entry *find_in_level(struct inode *dir, struct f2fs_dir_entry *de = NULL; bool room = false; int max_slots; - f2fs_hash_t namehash; - - if(fname->hash) - namehash = cpu_to_le32(fname->hash); - else - namehash = f2fs_dentry_hash(&name); + f2fs_hash_t namehash = f2fs_dentry_hash(&name, fname); nbucket = dir_buckets(level, F2FS_I(dir)->i_dir_level); nblock = bucket_blocks(level); @@ -250,6 +235,9 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, break; } out: + /* This is to increase the speed of f2fs_create */ + if (!de) + F2FS_I(dir)->task = current; return de; } @@ -268,7 +256,10 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = fscrypt_setup_filename(dir, child, 1, &fname); if (err) { - *res_page = ERR_PTR(err); + if (err == -ENOENT) + *res_page = NULL; + else + *res_page = ERR_PTR(err); return NULL; } @@ -330,24 +321,6 @@ static void init_dent_inode(const struct qstr *name, struct page *ipage) set_page_dirty(ipage); } -int update_dent_inode(struct inode *inode, struct inode *to, - const struct qstr *name) -{ - struct page *page; - - if (file_enc_name(to)) - return 0; - - page = get_node_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); - - init_dent_inode(name, page); - f2fs_put_page(page, 1); - - return 0; -} - void do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d) { @@ -377,7 +350,7 @@ static int make_empty_dir(struct inode *inode, dentry_blk = kmap_atomic(dentry_page); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); do_make_empty_dir(inode, parent, &d); kunmap_atomic(dentry_blk); @@ -431,15 +404,19 @@ struct page *init_inode_metadata(struct inode *inode, struct inode *dir, set_cold_node(inode, page); } - if (new_name) + if (new_name) { init_dent_inode(new_name, page); + if (f2fs_encrypted_inode(dir)) + file_set_enc_name(inode); + } /* * This file should be checkpointed during fsync. * We lost i_pino from now on. */ if (is_inode_flag_set(inode, FI_INC_LINK)) { - file_lost_pino(inode); + if (!S_ISDIR(inode->i_mode)) + file_lost_pino(inode); /* * If link the tmpfile to alias through linkat path, * we should remove this inode from orphan list. @@ -535,7 +512,7 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, level = 0; slots = GET_DENTRY_SLOTS(new_name->len); - dentry_hash = f2fs_dentry_hash(new_name); + dentry_hash = f2fs_dentry_hash(new_name, NULL); current_depth = F2FS_I(dir)->i_current_depth; if (F2FS_I(dir)->chash == dentry_hash) { @@ -545,8 +522,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, start: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) + if (time_to_inject(F2FS_I_SB(dir), FAULT_DIR_DEPTH)) { + f2fs_show_injection_info(FAULT_DIR_DEPTH); return -ENOSPC; + } #endif if (unlikely(current_depth == MAX_DIR_HASH_DEPTH)) return -ENOSPC; @@ -590,11 +569,9 @@ int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_update_dentry(ino, mode, &d, new_name, dentry_hash, bit_pos); set_page_dirty(dentry_page); @@ -643,14 +620,34 @@ int __f2fs_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct fscrypt_name fname; + struct page *page = NULL; + struct f2fs_dir_entry *de = NULL; int err; err = fscrypt_setup_filename(dir, name, 0, &fname); if (err) return err; - err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); - + /* + * An immature stakable filesystem shows a race condition between lookup + * and create. If we have same task when doing lookup and create, it's + * definitely fine as expected by VFS normally. Otherwise, let's just + * verify on-disk dentry one more time, which guarantees filesystem + * consistency more. + */ + if (current != F2FS_I(dir)->task) { + de = __f2fs_find_entry(dir, &fname, &page); + F2FS_I(dir)->task = NULL; + } + if (de) { + f2fs_dentry_kunmap(dir, page); + f2fs_put_page(page, 0); + err = -EEXIST; + } else if (IS_ERR(page)) { + err = PTR_ERR(page); + } else { + err = __f2fs_do_add_link(dir, &fname, inode, ino, mode); + } fscrypt_free_filename(&fname); return err; } @@ -708,6 +705,8 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); + struct address_space *mapping = page_mapping(page); + unsigned long flags; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -721,7 +720,7 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, dentry_blk = page_address(page); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) - clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); + __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); /* Let's check and deallocate this dentry page */ bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, @@ -738,6 +737,11 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, if (bit_pos == NR_DENTRY_IN_BLOCK && !truncate_hole(dir, page->index, page->index + 1)) { + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + clear_page_dirty_for_io(page); ClearPagePrivate(page); ClearPageUptodate(page); @@ -882,7 +886,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) dentry_blk = kmap(dentry_page); - make_dentry_ptr(inode, &d, (void *)dentry_blk, 1); + make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 6ed6424807b6..ff2352a0ed15 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -18,6 +18,179 @@ #include "node.h" #include +static struct rb_entry *__lookup_rb_tree_fast(struct rb_entry *cached_re, + unsigned int ofs) +{ + if (cached_re) { + if (cached_re->ofs <= ofs && + cached_re->ofs + cached_re->len > ofs) { + return cached_re; + } + } + return NULL; +} + +static struct rb_entry *__lookup_rb_tree_slow(struct rb_root *root, + unsigned int ofs) +{ + struct rb_node *node = root->rb_node; + struct rb_entry *re; + + while (node) { + re = rb_entry(node, struct rb_entry, rb_node); + + if (ofs < re->ofs) + node = node->rb_left; + else if (ofs >= re->ofs + re->len) + node = node->rb_right; + else + return re; + } + return NULL; +} + +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs) +{ + struct rb_entry *re; + + re = __lookup_rb_tree_fast(cached_re, ofs); + if (!re) + return __lookup_rb_tree_slow(root, ofs); + + return re; +} + +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs) +{ + struct rb_node **p = &root->rb_node; + struct rb_entry *re; + + while (*p) { + *parent = *p; + re = rb_entry(*parent, struct rb_entry, rb_node); + + if (ofs < re->ofs) + p = &(*p)->rb_left; + else if (ofs >= re->ofs + re->len) + p = &(*p)->rb_right; + else + f2fs_bug_on(sbi, 1); + } + + return p; +} + +/* + * lookup rb entry in position of @ofs in rb-tree, + * if hit, return the entry, otherwise, return NULL + * @prev_ex: extent before ofs + * @next_ex: extent after ofs + * @insert_p: insert point for new extent at ofs + * in order to simpfy the insertion after. + * tree must stay unchanged between lookup and insertion. + */ +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, + unsigned int ofs, + struct rb_entry **prev_entry, + struct rb_entry **next_entry, + struct rb_node ***insert_p, + struct rb_node **insert_parent, + bool force) +{ + struct rb_node **pnode = &root->rb_node; + struct rb_node *parent = NULL, *tmp_node; + struct rb_entry *re = cached_re; + + *insert_p = NULL; + *insert_parent = NULL; + *prev_entry = NULL; + *next_entry = NULL; + + if (RB_EMPTY_ROOT(root)) + return NULL; + + if (re) { + if (re->ofs <= ofs && re->ofs + re->len > ofs) + goto lookup_neighbors; + } + + while (*pnode) { + parent = *pnode; + re = rb_entry(*pnode, struct rb_entry, rb_node); + + if (ofs < re->ofs) + pnode = &(*pnode)->rb_left; + else if (ofs >= re->ofs + re->len) + pnode = &(*pnode)->rb_right; + else + goto lookup_neighbors; + } + + *insert_p = pnode; + *insert_parent = parent; + + re = rb_entry(parent, struct rb_entry, rb_node); + tmp_node = parent; + if (parent && ofs > re->ofs) + tmp_node = rb_next(parent); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + + tmp_node = parent; + if (parent && ofs < re->ofs) + tmp_node = rb_prev(parent); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + return NULL; + +lookup_neighbors: + if (ofs == re->ofs || force) { + /* lookup prev node for merging backward later */ + tmp_node = rb_prev(&re->rb_node); + *prev_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + if (ofs == re->ofs + re->len - 1 || force) { + /* lookup next node for merging frontward later */ + tmp_node = rb_next(&re->rb_node); + *next_entry = rb_entry_safe(tmp_node, struct rb_entry, rb_node); + } + return re; +} + +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct rb_node *cur = rb_first(root), *next; + struct rb_entry *cur_re, *next_re; + + if (!cur) + return true; + + while (cur) { + next = rb_next(cur); + if (!next) + return true; + + cur_re = rb_entry(cur, struct rb_entry, rb_node); + next_re = rb_entry(next, struct rb_entry, rb_node); + + if (cur_re->ofs + cur_re->len > next_re->ofs) { + f2fs_msg(sbi->sb, KERN_INFO, "inconsistent rbtree, " + "cur(%u, %u) next(%u, %u)", + cur_re->ofs, cur_re->len, + next_re->ofs, next_re->len); + return false; + } + + cur = next; + } +#endif + return true; +} + static struct kmem_cache *extent_tree_slab; static struct kmem_cache *extent_node_slab; @@ -77,7 +250,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) struct extent_tree *et; nid_t ino = inode->i_ino; - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); et = radix_tree_lookup(&sbi->extent_tree_root, ino); if (!et) { et = f2fs_kmem_cache_alloc(extent_tree_slab, GFP_NOFS); @@ -94,7 +267,7 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) atomic_dec(&sbi->total_zombie_tree); list_del_init(&et->list); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); /* never died until evict_inode */ F2FS_I(inode)->extent_tree = et; @@ -102,36 +275,6 @@ static struct extent_tree *__grab_extent_tree(struct inode *inode) return et; } -static struct extent_node *__lookup_extent_tree(struct f2fs_sb_info *sbi, - struct extent_tree *et, unsigned int fofs) -{ - struct rb_node *node = et->root.rb_node; - struct extent_node *en = et->cached_en; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) { - stat_inc_cached_node_hit(sbi); - return en; - } - } - - while (node) { - en = rb_entry(node, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) { - node = node->rb_left; - } else if (fofs >= en->ei.fofs + en->ei.len) { - node = node->rb_right; - } else { - stat_inc_rbtree_node_hit(sbi); - return en; - } - } - return NULL; -} - static struct extent_node *__init_extent_tree(struct f2fs_sb_info *sbi, struct extent_tree *et, struct extent_info *ei) { @@ -177,7 +320,7 @@ static void __drop_largest_extent(struct inode *inode, } /* return true, if inode page is changed */ -bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +static bool __f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree *et; @@ -215,6 +358,16 @@ bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) return false; } +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext) +{ + bool ret = __f2fs_init_extent_tree(inode, i_ext); + + if (!F2FS_I(inode)->extent_tree) + set_inode_flag(inode, FI_NO_EXTENT); + + return ret; +} + static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, struct extent_info *ei) { @@ -237,17 +390,24 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, goto out; } - en = __lookup_extent_tree(sbi, et, pgofs); - if (en) { - *ei = en->ei; - spin_lock(&sbi->extent_lock); - if (!list_empty(&en->list)) { - list_move_tail(&en->list, &sbi->extent_list); - et->cached_en = en; - } - spin_unlock(&sbi->extent_lock); - ret = true; + en = (struct extent_node *)__lookup_rb_tree(&et->root, + (struct rb_entry *)et->cached_en, pgofs); + if (!en) + goto out; + + if (en == et->cached_en) + stat_inc_cached_node_hit(sbi); + else + stat_inc_rbtree_node_hit(sbi); + + *ei = en->ei; + spin_lock(&sbi->extent_lock); + if (!list_empty(&en->list)) { + list_move_tail(&en->list, &sbi->extent_list); + et->cached_en = en; } + spin_unlock(&sbi->extent_lock); + ret = true; out: stat_inc_total_hit(sbi); read_unlock(&et->lock); @@ -256,83 +416,6 @@ static bool f2fs_lookup_extent_tree(struct inode *inode, pgoff_t pgofs, return ret; } - -/* - * lookup extent at @fofs, if hit, return the extent - * if not, return NULL and - * @prev_ex: extent before fofs - * @next_ex: extent after fofs - * @insert_p: insert point for new extent at fofs - * in order to simpfy the insertion after. - * tree must stay unchanged between lookup and insertion. - */ -static struct extent_node *__lookup_extent_tree_ret(struct extent_tree *et, - unsigned int fofs, - struct extent_node **prev_ex, - struct extent_node **next_ex, - struct rb_node ***insert_p, - struct rb_node **insert_parent) -{ - struct rb_node **pnode = &et->root.rb_node; - struct rb_node *parent = NULL, *tmp_node; - struct extent_node *en = et->cached_en; - - *insert_p = NULL; - *insert_parent = NULL; - *prev_ex = NULL; - *next_ex = NULL; - - if (RB_EMPTY_ROOT(&et->root)) - return NULL; - - if (en) { - struct extent_info *cei = &en->ei; - - if (cei->fofs <= fofs && cei->fofs + cei->len > fofs) - goto lookup_neighbors; - } - - while (*pnode) { - parent = *pnode; - en = rb_entry(*pnode, struct extent_node, rb_node); - - if (fofs < en->ei.fofs) - pnode = &(*pnode)->rb_left; - else if (fofs >= en->ei.fofs + en->ei.len) - pnode = &(*pnode)->rb_right; - else - goto lookup_neighbors; - } - - *insert_p = pnode; - *insert_parent = parent; - - en = rb_entry(parent, struct extent_node, rb_node); - tmp_node = parent; - if (parent && fofs > en->ei.fofs) - tmp_node = rb_next(parent); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - - tmp_node = parent; - if (parent && fofs < en->ei.fofs) - tmp_node = rb_prev(parent); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - return NULL; - -lookup_neighbors: - if (fofs == en->ei.fofs) { - /* lookup prev node for merging backward later */ - tmp_node = rb_prev(&en->rb_node); - *prev_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - if (fofs == en->ei.fofs + en->ei.len - 1) { - /* lookup next node for merging frontward later */ - tmp_node = rb_next(&en->rb_node); - *next_ex = rb_entry_safe(tmp_node, struct extent_node, rb_node); - } - return en; -} - static struct extent_node *__try_merge_extent_node(struct inode *inode, struct extent_tree *et, struct extent_info *ei, struct extent_node *prev_ex, @@ -387,17 +470,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, goto do_insert; } - while (*p) { - parent = *p; - en = rb_entry(parent, struct extent_node, rb_node); - - if (ei->fofs < en->ei.fofs) - p = &(*p)->rb_left; - else if (ei->fofs >= en->ei.fofs + en->ei.len) - p = &(*p)->rb_right; - else - f2fs_bug_on(sbi, 1); - } + p = __lookup_rb_tree_for_insert(sbi, &et->root, &parent, ei->fofs); do_insert: en = __attach_extent_node(sbi, et, ei, parent, p); if (!en) @@ -413,7 +486,7 @@ static struct extent_node *__insert_extent_tree(struct inode *inode, return en; } -static unsigned int f2fs_update_extent_tree_range(struct inode *inode, +static void f2fs_update_extent_tree_range(struct inode *inode, pgoff_t fofs, block_t blkaddr, unsigned int len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); @@ -426,7 +499,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, unsigned int pos = (unsigned int)fofs; if (!et) - return false; + return; trace_f2fs_update_extent_tree_range(inode, fofs, blkaddr, len); @@ -434,7 +507,7 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, if (is_inode_flag_set(inode, FI_NO_EXTENT)) { write_unlock(&et->lock); - return false; + return; } prev = et->largest; @@ -447,8 +520,11 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __drop_largest_extent(inode, fofs, len); /* 1. lookup first extent node in range [fofs, fofs + len - 1] */ - en = __lookup_extent_tree_ret(et, fofs, &prev_en, &next_en, - &insert_p, &insert_parent); + en = (struct extent_node *)__lookup_rb_tree_ret(&et->root, + (struct rb_entry *)et->cached_en, fofs, + (struct rb_entry **)&prev_en, + (struct rb_entry **)&next_en, + &insert_p, &insert_parent, false); if (!en) en = next_en; @@ -531,8 +607,6 @@ static unsigned int f2fs_update_extent_tree_range(struct inode *inode, __free_extent_tree(sbi, et); write_unlock(&et->lock); - - return !__is_extent_same(&prev, &et->largest); } unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) @@ -548,7 +622,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) if (!atomic_read(&sbi->total_zombie_tree)) goto free_node; - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; /* 1. remove unreferenced extent tree */ @@ -570,11 +644,11 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) goto unlock_out; cond_resched(); } - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); free_node: /* 2. remove LRU extent entries */ - if (!down_write_trylock(&sbi->extent_tree_lock)) + if (!mutex_trylock(&sbi->extent_tree_lock)) goto out; remained = nr_shrink - (node_cnt + tree_cnt); @@ -604,7 +678,7 @@ unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink) spin_unlock(&sbi->extent_lock); unlock_out: - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); out: trace_f2fs_shrink_extent_tree(sbi, node_cnt, tree_cnt); @@ -651,10 +725,10 @@ void f2fs_destroy_extent_tree(struct inode *inode) if (inode->i_nlink && !is_bad_inode(inode) && atomic_read(&et->node_cnt)) { - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); list_add_tail(&et->list, &sbi->zombie_list); atomic_inc(&sbi->total_zombie_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); return; } @@ -662,12 +736,12 @@ void f2fs_destroy_extent_tree(struct inode *inode) node_cnt = f2fs_destroy_extent_node(inode); /* delete extent tree entry in radix tree */ - down_write(&sbi->extent_tree_lock); + mutex_lock(&sbi->extent_tree_lock); f2fs_bug_on(sbi, atomic_read(&et->node_cnt)); radix_tree_delete(&sbi->extent_tree_root, inode->i_ino); kmem_cache_free(extent_tree_slab, et); atomic_dec(&sbi->total_ext_tree); - up_write(&sbi->extent_tree_lock); + mutex_unlock(&sbi->extent_tree_lock); F2FS_I(inode)->extent_tree = NULL; @@ -714,7 +788,7 @@ void f2fs_update_extent_cache_range(struct dnode_of_data *dn, void init_extent_cache_info(struct f2fs_sb_info *sbi) { INIT_RADIX_TREE(&sbi->extent_tree_root, GFP_NOIO); - init_rwsem(&sbi->extent_tree_lock); + mutex_init(&sbi->extent_tree_lock); INIT_LIST_HEAD(&sbi->extent_list); spin_lock_init(&sbi->extent_lock); atomic_set(&sbi->total_ext_tree, 0); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 167c5f841b5f..ff694127243a 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -22,7 +22,12 @@ #include #include #include -#include +#include +#ifdef CONFIG_F2FS_FS_ENCRYPTION +#include +#else +#include +#endif #include #include @@ -47,6 +52,7 @@ enum { FAULT_BLOCK, FAULT_DIR_DEPTH, FAULT_EVICT_INODE, + FAULT_TRUNCATE, FAULT_IO, FAULT_CHECKPOINT, FAULT_MAX, @@ -59,7 +65,7 @@ struct f2fs_fault_info { }; extern char *fault_name[FAULT_MAX]; -#define IS_FAULT_SET(fi, type) (fi->inject_type & (1 << (type))) +#define IS_FAULT_SET(fi, type) ((fi)->inject_type & (1 << (type))) #endif /* @@ -84,10 +90,14 @@ extern char *fault_name[FAULT_MAX]; #define F2FS_MOUNT_FAULT_INJECTION 0x00010000 #define F2FS_MOUNT_ADAPTIVE 0x00020000 #define F2FS_MOUNT_LFS 0x00040000 +#define F2FS_MOUNT_USRQUOTA 0x00080000 +#define F2FS_MOUNT_GRPQUOTA 0x00100000 +#define F2FS_MOUNT_PRJQUOTA 0x00200000 +#define F2FS_MOUNT_QUOTA 0x00400000 -#define clear_opt(sbi, option) (sbi->mount_opt.opt &= ~F2FS_MOUNT_##option) -#define set_opt(sbi, option) (sbi->mount_opt.opt |= F2FS_MOUNT_##option) -#define test_opt(sbi, option) (sbi->mount_opt.opt & F2FS_MOUNT_##option) +#define clear_opt(sbi, option) ((sbi)->mount_opt.opt &= ~F2FS_MOUNT_##option) +#define set_opt(sbi, option) ((sbi)->mount_opt.opt |= F2FS_MOUNT_##option) +#define test_opt(sbi, option) ((sbi)->mount_opt.opt & F2FS_MOUNT_##option) #define ver_after(a, b) (typecheck(unsigned long long, a) && \ typecheck(unsigned long long, b) && \ @@ -103,15 +113,19 @@ struct f2fs_mount_info { unsigned int opt; }; -#define F2FS_FEATURE_ENCRYPT 0x0001 -#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ENCRYPT 0x0001 +#define F2FS_FEATURE_BLKZONED 0x0002 +#define F2FS_FEATURE_ATOMIC_WRITE 0x0004 +#define F2FS_FEATURE_EXTRA_ATTR 0x0008 +#define F2FS_FEATURE_PRJQUOTA 0x0010 +#define F2FS_FEATURE_INODE_CHKSUM 0x0020 #define F2FS_HAS_FEATURE(sb, mask) \ ((F2FS_SB(sb)->raw_super->feature & cpu_to_le32(mask)) != 0) #define F2FS_SET_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature |= cpu_to_le32(mask)) #define F2FS_CLEAR_FEATURE(sb, mask) \ - F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask) + (F2FS_SB(sb)->raw_super->feature &= ~cpu_to_le32(mask)) /* bio stuffs */ #define REQ_OP_READ READ @@ -187,19 +201,22 @@ enum { SIT_BITMAP }; -enum { - CP_UMOUNT, - CP_FASTBOOT, - CP_SYNC, - CP_RECOVERY, - CP_DISCARD, -}; +#define CP_UMOUNT 0x00000001 +#define CP_FASTBOOT 0x00000002 +#define CP_SYNC 0x00000004 +#define CP_RECOVERY 0x00000008 +#define CP_DISCARD 0x00000010 +#define CP_TRIMMED 0x00000020 -#define DEF_BATCHED_TRIM_SECTIONS 2 +#define DEF_BATCHED_TRIM_SECTIONS 2048 #define BATCHED_TRIM_SEGMENTS(sbi) \ - (SM_I(sbi)->trim_sections * (sbi)->segs_per_sec) + (GET_SEG_FROM_SEC(sbi, SM_I(sbi)->trim_sections)) #define BATCHED_TRIM_BLOCKS(sbi) \ (BATCHED_TRIM_SEGMENTS(sbi) << (sbi)->log_blocks_per_seg) +#define MAX_DISCARD_BLOCKS(sbi) BLKS_PER_SEC(sbi) +#define DISCARD_ISSUE_RATE 8 +#define DEF_MIN_DISCARD_ISSUE_TIME 50 /* 50 ms, if exists */ +#define DEF_MAX_DISCARD_ISSUE_TIME 60000 /* 60 s, if no candidates */ #define DEF_CP_INTERVAL 60 /* 60 secs */ #define DEF_IDLE_INTERVAL 5 /* 5 secs */ @@ -241,19 +258,73 @@ struct inode_entry { struct inode *inode; /* vfs inode pointer */ }; -/* for the list of blockaddresses to be discarded */ +/* for the bitmap indicate blocks to be discarded */ struct discard_entry { struct list_head list; /* list head */ - block_t blkaddr; /* block address to be discarded */ - int len; /* # of consecutive blocks of the discard */ + block_t start_blkaddr; /* start blockaddr of current segment */ + unsigned char discard_map[SIT_VBLOCK_MAP_SIZE]; /* segment discard bitmap */ +}; + +/* default discard granularity of inner discard thread, unit: block count */ +#define DEFAULT_DISCARD_GRANULARITY 16 + +/* max discard pend list number */ +#define MAX_PLIST_NUM 512 +#define plist_idx(blk_num) ((blk_num) >= MAX_PLIST_NUM ? \ + (MAX_PLIST_NUM - 1) : (blk_num - 1)) + +#define P_ACTIVE 0x01 +#define P_TRIM 0x02 +#define plist_issue(tag) (((tag) & P_ACTIVE) || ((tag) & P_TRIM)) + +enum { + D_PREP, + D_SUBMIT, + D_DONE, +}; + +struct discard_info { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ }; struct discard_cmd { + struct rb_node rb_node; /* rb node located in rb-tree */ + union { + struct { + block_t lstart; /* logical start address */ + block_t len; /* length */ + block_t start; /* actual start address in dev */ + }; + struct discard_info di; /* discard info */ + + }; struct list_head list; /* command list */ struct completion wait; /* compleation */ - block_t lstart; /* logical start address */ - block_t len; /* length */ - struct bio *bio; /* bio */ + struct block_device *bdev; /* bdev */ + unsigned short ref; /* reference count */ + unsigned char state; /* state */ + int error; /* bio error */ +}; + +struct discard_cmd_control { + struct task_struct *f2fs_issue_discard; /* discard thread */ + struct list_head entry_list; /* 4KB discard entry list */ + struct list_head pend_list[MAX_PLIST_NUM];/* store pending entries */ + unsigned char pend_list_tag[MAX_PLIST_NUM];/* tag for pending entries */ + struct list_head wait_list; /* store on-flushing entries */ + wait_queue_head_t discard_wait_queue; /* waiting queue for wake-up */ + unsigned int discard_wake; /* to wake up discard thread */ + struct mutex cmd_lock; + unsigned int nr_discards; /* # of discards in the list */ + unsigned int max_discards; /* max. discards to be issued */ + unsigned int discard_granularity; /* discard granularity */ + unsigned int undiscard_blks; /* # of undiscard blocks */ + atomic_t issued_discard; /* # of issued discard */ + atomic_t issing_discard; /* # of issing discard */ + atomic_t discard_cmd_cnt; /* # of cached cmd count */ + struct rb_root root; /* root of discard rb-tree */ }; /* for the list of fsync inodes, used only during recovery */ @@ -264,13 +335,13 @@ struct fsync_inode_entry { block_t last_dentry; /* block address locating the last dentry */ }; -#define nats_in_cursum(jnl) (le16_to_cpu(jnl->n_nats)) -#define sits_in_cursum(jnl) (le16_to_cpu(jnl->n_sits)) +#define nats_in_cursum(jnl) (le16_to_cpu((jnl)->n_nats)) +#define sits_in_cursum(jnl) (le16_to_cpu((jnl)->n_sits)) -#define nat_in_journal(jnl, i) (jnl->nat_j.entries[i].ne) -#define nid_in_journal(jnl, i) (jnl->nat_j.entries[i].nid) -#define sit_in_journal(jnl, i) (jnl->sit_j.entries[i].se) -#define segno_in_journal(jnl, i) (jnl->sit_j.entries[i].segno) +#define nat_in_journal(jnl, i) ((jnl)->nat_j.entries[i].ne) +#define nid_in_journal(jnl, i) ((jnl)->nat_j.entries[i].nid) +#define sit_in_journal(jnl, i) ((jnl)->sit_j.entries[i].se) +#define segno_in_journal(jnl, i) ((jnl)->sit_j.entries[i].segno) #define MAX_NAT_JENTRIES(jnl) (NAT_JOURNAL_ENTRIES - nats_in_cursum(jnl)) #define MAX_SIT_JENTRIES(jnl) (SIT_JOURNAL_ENTRIES - sits_in_cursum(jnl)) @@ -278,6 +349,7 @@ struct fsync_inode_entry { static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) { int before = nats_in_cursum(journal); + journal->n_nats = cpu_to_le16(before + i); return before; } @@ -285,6 +357,7 @@ static inline int update_nats_in_cursum(struct f2fs_journal *journal, int i) static inline int update_sits_in_cursum(struct f2fs_journal *journal, int i) { int before = sits_in_cursum(journal); + journal->n_sits = cpu_to_le16(before + i); return before; } @@ -310,11 +383,17 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC_START_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 3) #define F2FS_IOC_RELEASE_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 4) #define F2FS_IOC_ABORT_VOLATILE_WRITE _IO(F2FS_IOCTL_MAGIC, 5) -#define F2FS_IOC_GARBAGE_COLLECT _IO(F2FS_IOCTL_MAGIC, 6) +#define F2FS_IOC_GARBAGE_COLLECT _IOW(F2FS_IOCTL_MAGIC, 6, __u32) #define F2FS_IOC_WRITE_CHECKPOINT _IO(F2FS_IOCTL_MAGIC, 7) -#define F2FS_IOC_DEFRAGMENT _IO(F2FS_IOCTL_MAGIC, 8) +#define F2FS_IOC_DEFRAGMENT _IOWR(F2FS_IOCTL_MAGIC, 8, \ + struct f2fs_defragment) #define F2FS_IOC_MOVE_RANGE _IOWR(F2FS_IOCTL_MAGIC, 9, \ struct f2fs_move_range) +#define F2FS_IOC_FLUSH_DEVICE _IOW(F2FS_IOCTL_MAGIC, 10, \ + struct f2fs_flush_device) +#define F2FS_IOC_GARBAGE_COLLECT_RANGE _IOW(F2FS_IOCTL_MAGIC, 11, \ + struct f2fs_gc_range) +#define F2FS_IOC_GET_FEATURES _IOR(F2FS_IOCTL_MAGIC, 12, __u32) #define F2FS_IOC_SET_ENCRYPTION_POLICY FS_IOC_SET_ENCRYPTION_POLICY #define F2FS_IOC_GET_ENCRYPTION_POLICY FS_IOC_GET_ENCRYPTION_POLICY @@ -339,6 +418,12 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal, #define F2FS_IOC32_GETVERSION FS_IOC32_GETVERSION #endif +struct f2fs_gc_range { + u32 sync; + u64 start; + u64 len; +}; + struct f2fs_defragment { u64 start; u64 len; @@ -351,36 +436,68 @@ struct f2fs_move_range { u64 len; /* size to move */ }; +struct f2fs_flush_device { + u32 dev_num; /* device number to flush */ + u32 segments; /* # of segments to flush */ +}; + +/* for inline stuff */ +#define DEF_INLINE_RESERVED_SIZE 1 +static inline int get_extra_isize(struct inode *inode); +#define MAX_INLINE_DATA(inode) (sizeof(__le32) * \ + (CUR_ADDRS_PER_INODE(inode) - \ + DEF_INLINE_RESERVED_SIZE - \ + F2FS_INLINE_XATTR_ADDRS)) + +/* for inline dir */ +#define NR_INLINE_DENTRY(inode) (MAX_INLINE_DATA(inode) * BITS_PER_BYTE / \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + BITS_PER_BYTE + 1)) +#define INLINE_DENTRY_BITMAP_SIZE(inode) ((NR_INLINE_DENTRY(inode) + \ + BITS_PER_BYTE - 1) / BITS_PER_BYTE) +#define INLINE_RESERVED_SIZE(inode) (MAX_INLINE_DATA(inode) - \ + ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ + NR_INLINE_DENTRY(inode) + \ + INLINE_DENTRY_BITMAP_SIZE(inode))) + /* * For INODE and NODE manager */ /* for directory operations */ struct f2fs_dentry_ptr { struct inode *inode; - const void *bitmap; + void *bitmap; struct f2fs_dir_entry *dentry; __u8 (*filename)[F2FS_SLOT_LEN]; int max; + int nr_bitmap; }; -static inline void make_dentry_ptr(struct inode *inode, - struct f2fs_dentry_ptr *d, void *src, int type) +static inline void make_dentry_ptr_block(struct inode *inode, + struct f2fs_dentry_ptr *d, struct f2fs_dentry_block *t) { d->inode = inode; + d->max = NR_DENTRY_IN_BLOCK; + d->nr_bitmap = SIZE_OF_DENTRY_BITMAP; + d->bitmap = &t->dentry_bitmap; + d->dentry = t->dentry; + d->filename = t->filename; +} - if (type == 1) { - struct f2fs_dentry_block *t = (struct f2fs_dentry_block *)src; - d->max = NR_DENTRY_IN_BLOCK; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } else { - struct f2fs_inline_dentry *t = (struct f2fs_inline_dentry *)src; - d->max = NR_INLINE_DENTRY; - d->bitmap = &t->dentry_bitmap; - d->dentry = t->dentry; - d->filename = t->filename; - } +static inline void make_dentry_ptr_inline(struct inode *inode, + struct f2fs_dentry_ptr *d, void *t) +{ + int entry_cnt = NR_INLINE_DENTRY(inode); + int bitmap_size = INLINE_DENTRY_BITMAP_SIZE(inode); + int reserved_size = INLINE_RESERVED_SIZE(inode); + + d->inode = inode; + d->max = entry_cnt; + d->nr_bitmap = bitmap_size; + d->bitmap = t; + d->dentry = t + bitmap_size + reserved_size; + d->filename = t + bitmap_size + reserved_size + + SIZE_OF_DIR_ENTRY * entry_cnt; } /* @@ -412,16 +529,30 @@ enum { /* number of extent info in extent cache we try to shrink */ #define EXTENT_CACHE_SHRINK_NUMBER 128 +struct rb_entry { + struct rb_node rb_node; /* rb node located in rb-tree */ + unsigned int ofs; /* start offset of the entry */ + unsigned int len; /* length of the entry */ +}; + struct extent_info { unsigned int fofs; /* start offset in a file */ - u32 blk; /* start block address of the extent */ unsigned int len; /* length of the extent */ + u32 blk; /* start block address of the extent */ }; struct extent_node { - struct rb_node rb_node; /* rb node located in rb-tree */ + struct rb_node rb_node; + union { + struct { + unsigned int fofs; + unsigned int len; + u32 blk; + }; + struct extent_info ei; /* extent info */ + + }; struct list_head list; /* node in global extent list of sbi */ - struct extent_info ei; /* extent info */ struct extent_tree *et; /* extent tree pointer */ }; @@ -455,12 +586,13 @@ struct f2fs_map_blocks { }; /* for flag in get_data_block */ -#define F2FS_GET_BLOCK_READ 0 -#define F2FS_GET_BLOCK_DIO 1 -#define F2FS_GET_BLOCK_FIEMAP 2 -#define F2FS_GET_BLOCK_BMAP 3 -#define F2FS_GET_BLOCK_PRE_DIO 4 -#define F2FS_GET_BLOCK_PRE_AIO 5 +enum { + F2FS_GET_BLOCK_DEFAULT, + F2FS_GET_BLOCK_FIEMAP, + F2FS_GET_BLOCK_BMAP, + F2FS_GET_BLOCK_PRE_DIO, + F2FS_GET_BLOCK_PRE_AIO, +}; /* * i_advise uses FADVISE_XXX_BIT. We can add additional hints later. @@ -502,16 +634,29 @@ struct f2fs_inode_info { atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ unsigned int clevel; /* maximum level of given file name */ + struct task_struct *task; /* lookup and create consistency */ + struct task_struct *cp_task; /* separate cp/wb IO stats*/ nid_t i_xattr_nid; /* node id that contains xattrs */ - unsigned long long xattr_ver; /* cp version of xattr modification */ loff_t last_disk_size; /* lastly written file size */ +#ifdef CONFIG_QUOTA + struct dquot *i_dquot[MAXQUOTAS]; + + /* quota space reservation, managed internally by quota code */ + qsize_t i_reserved_quota; +#endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ struct list_head inmem_pages; /* inmemory pages managed by f2fs */ + struct task_struct *inmem_task; /* store inmemory task */ struct mutex inmem_lock; /* lock for inmemory pages */ struct extent_tree *extent_tree; /* cached extent_tree entry */ struct rw_semaphore dio_rwsem[2];/* avoid racing between dio and gc */ + struct rw_semaphore i_mmap_sem; + struct rw_semaphore i_xattr_sem; /* avoid racing between reading and changing EAs */ + + int i_extra_isize; /* size of extra space located in i_addr */ + kprojid_t i_projid; /* id for project quota */ }; static inline void get_extent_info(struct extent_info *ext, @@ -538,11 +683,22 @@ static inline void set_extent_info(struct extent_info *ei, unsigned int fofs, ei->len = len; } -static inline bool __is_extent_same(struct extent_info *ei1, - struct extent_info *ei2) +static inline bool __is_discard_mergeable(struct discard_info *back, + struct discard_info *front) { - return (ei1->fofs == ei2->fofs && ei1->blk == ei2->blk && - ei1->len == ei2->len); + return back->lstart + back->len == front->lstart; +} + +static inline bool __is_discard_back_mergeable(struct discard_info *cur, + struct discard_info *back) +{ + return __is_discard_mergeable(back, cur); +} + +static inline bool __is_discard_front_mergeable(struct discard_info *cur, + struct discard_info *front) +{ + return __is_discard_mergeable(cur, front); } static inline bool __is_extent_mergeable(struct extent_info *back, @@ -564,7 +720,7 @@ static inline bool __is_front_mergeable(struct extent_info *cur, return __is_extent_mergeable(cur, front); } -extern void f2fs_mark_inode_dirty_sync(struct inode *, bool); +extern void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync); static inline void __try_update_largest_extent(struct inode *inode, struct extent_tree *et, struct extent_node *en) { @@ -596,6 +752,7 @@ struct f2fs_nm_info { struct list_head nat_entries; /* cached nat entry list (clean) */ unsigned int nat_cnt; /* the # of cached nat entries */ unsigned int dirty_nat_cnt; /* total num of nat entries in set */ + unsigned int nat_blocks; /* # of nat blocks */ /* free node ids management */ struct radix_tree_root free_nid_root;/* root of the free_nid cache */ @@ -603,9 +760,17 @@ struct f2fs_nm_info { unsigned int nid_cnt[MAX_NID_LIST]; /* the number of free node id */ spinlock_t nid_list_lock; /* protect nid lists ops */ struct mutex build_lock; /* lock for build free nids */ + unsigned char (*free_nid_bitmap)[NAT_ENTRY_BITMAP_SIZE]; + unsigned char *nat_block_bitmap; + unsigned short *free_nid_count; /* free nid count of NAT block */ /* for checkpoint */ char *nat_bitmap; /* NAT bitmap pointer */ + + unsigned int nat_bits_blocks; /* # of nat bits blocks */ + unsigned char *nat_bits; /* NAT bits blocks */ + unsigned char *full_nat_bits; /* full NAT pages */ + unsigned char *empty_nat_bits; /* empty NAT pages */ #ifdef CONFIG_F2FS_CHECK_FS char *nat_bitmap_mir; /* NAT bitmap mirror */ #endif @@ -676,7 +841,8 @@ struct flush_cmd { struct flush_cmd_control { struct task_struct *f2fs_issue_flush; /* flush thread */ wait_queue_head_t flush_wait_queue; /* waiting queue for wake-up */ - atomic_t submit_flush; /* # of issued flushes */ + atomic_t issued_flush; /* # of issued flushes */ + atomic_t issing_flush; /* # of issing flushes */ struct llist_head issue_list; /* list for command issue */ struct llist_node *dispatch_list; /* list for command dispatch */ }; @@ -699,12 +865,6 @@ struct f2fs_sm_info { /* a threshold to reclaim prefree segments */ unsigned int rec_prefree_segments; - /* for small discard management */ - struct list_head discard_entry_list; /* 4KB discard entry list */ - struct list_head discard_cmd_list; /* discard cmd list */ - int nr_discards; /* # of discards in the list */ - int max_discards; /* max. discards to be issued */ - /* for batched trimming */ unsigned int trim_sections; /* # of sections to trim */ @@ -713,9 +873,13 @@ struct f2fs_sm_info { unsigned int ipu_policy; /* in-place-update policy */ unsigned int min_ipu_util; /* in-place-update threshold */ unsigned int min_fsync_blocks; /* threshold for fsync */ + unsigned int min_hot_blocks; /* threshold for hot block allocation */ /* for flush command control */ struct flush_cmd_control *fcc_info; + + /* for discard command control */ + struct discard_cmd_control *dcc_info; }; /* @@ -760,29 +924,68 @@ enum page_type { META_FLUSH, INMEM, /* the below types are used by tracepoints only. */ INMEM_DROP, + INMEM_INVALIDATE, INMEM_REVOKE, IPU, OPU, }; +enum temp_type { + HOT = 0, /* must be zero for meta bio */ + WARM, + COLD, + NR_TEMP_TYPE, +}; + +enum need_lock_type { + LOCK_REQ = 0, + LOCK_DONE, + LOCK_RETRY, +}; + +enum iostat_type { + APP_DIRECT_IO, /* app direct IOs */ + APP_BUFFERED_IO, /* app buffered IOs */ + APP_WRITE_IO, /* app write IOs */ + APP_MAPPED_IO, /* app mapped IOs */ + FS_DATA_IO, /* data IOs from kworker/fsync/reclaimer */ + FS_NODE_IO, /* node IOs from kworker/fsync/reclaimer */ + FS_META_IO, /* meta IOs from kworker/reclaimer */ + FS_GC_DATA_IO, /* data IOs from forground gc */ + FS_GC_NODE_IO, /* node IOs from forground gc */ + FS_CP_DATA_IO, /* data IOs from checkpoint */ + FS_CP_NODE_IO, /* node IOs from checkpoint */ + FS_CP_META_IO, /* meta IOs from checkpoint */ + FS_DISCARD, /* discard */ + NR_IO_TYPE, +}; + struct f2fs_io_info { struct f2fs_sb_info *sbi; /* f2fs_sb_info pointer */ enum page_type type; /* contains DATA/NODE/META/META_FLUSH */ + enum temp_type temp; /* contains HOT/WARM/COLD */ int op; /* contains REQ_OP_ */ int op_flags; /* req_flag_bits */ block_t new_blkaddr; /* new block address to be written */ block_t old_blkaddr; /* old block address before Cow */ struct page *page; /* page to be written */ struct page *encrypted_page; /* encrypted page */ + struct list_head list; /* serialize IOs */ + bool submitted; /* indicate IO submission */ + int need_lock; /* indicate we need to lock cp_rwsem */ + bool in_list; /* indicate fio is in io_list */ + enum iostat_type io_type; /* io type */ }; -#define is_read_io(rw) (rw == READ) +#define is_read_io(rw) ((rw) == READ) struct f2fs_bio_info { struct f2fs_sb_info *sbi; /* f2fs superblock */ struct bio *bio; /* bios to merge */ sector_t last_block_in_bio; /* last block number */ struct f2fs_io_info fio; /* store buffered io info. */ struct rw_semaphore io_rwsem; /* blocking op for bio */ + spinlock_t io_lock; /* serialize DATA/NODE IOs */ + struct list_head io_list; /* track fios */ }; #define FDEV(i) (sbi->devs[i]) @@ -830,10 +1033,6 @@ enum { MAX_TIME, }; -#ifdef CONFIG_F2FS_FS_ENCRYPTION -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 -#endif struct f2fs_sb_info { struct super_block *sb; /* pointer to VFS super block */ struct proc_dir_entry *s_proc; /* proc entry */ @@ -841,11 +1040,6 @@ struct f2fs_sb_info { int valid_super_block; /* valid super block no */ unsigned long s_flag; /* flags for sbi */ -#ifdef CONFIG_F2FS_FS_ENCRYPTION - u8 key_prefix[F2FS_KEY_DESC_PREFIX_SIZE]; - u8 key_prefix_size; -#endif - #ifdef CONFIG_BLK_DEV_ZONED unsigned int blocks_per_blkz; /* F2FS blocks per zone */ unsigned int log_blocks_per_blkz; /* log2 F2FS blocks per zone */ @@ -859,9 +1053,9 @@ struct f2fs_sb_info { struct f2fs_sm_info *sm_info; /* segment manager */ /* for bio operations */ - struct f2fs_bio_info read_io; /* for read bios */ - struct f2fs_bio_info write_io[NR_PAGE_TYPE]; /* for write bios */ - struct mutex wio_mutex[NODE + 1]; /* bio ordering for NODE/DATA */ + struct f2fs_bio_info *write_io[NR_PAGE_TYPE]; /* for write bios */ + struct mutex wio_mutex[NR_PAGE_TYPE - 1][NR_TEMP_TYPE]; + /* bio ordering for NODE/DATA */ int write_io_size_bits; /* Write IO size bits */ mempool_t *write_io_dummy; /* Dummy pages */ @@ -873,6 +1067,7 @@ struct f2fs_sb_info { struct mutex cp_mutex; /* checkpoint procedure lock */ struct rw_semaphore cp_rwsem; /* blocking FS operations */ struct rw_semaphore node_write; /* locking node writes */ + struct rw_semaphore node_change; /* locking node change */ wait_queue_head_t cp_wait; unsigned long last_time[MAX_TIME]; /* to store time in jiffies */ long interval_time[MAX_TIME]; /* to store thresholds */ @@ -888,7 +1083,7 @@ struct f2fs_sb_info { /* for extent tree cache */ struct radix_tree_root extent_tree_root;/* cache extent cache entries */ - struct rw_semaphore extent_tree_lock; /* locking extent radix tree */ + struct mutex extent_tree_lock; /* locking extent radix tree */ struct list_head extent_list; /* lru list for shrinker */ spinlock_t extent_lock; /* locking extent lru list */ atomic_t total_ext_tree; /* extent tree count */ @@ -918,6 +1113,8 @@ struct f2fs_sb_info { block_t total_valid_block_count; /* # of valid blocks */ block_t discard_blks; /* discard command candidats */ block_t last_valid_block_count; /* for recovery */ + block_t reserved_blocks; /* configurable reserved blocks */ + u32 s_next_generation; /* for NFS support */ /* # of pages, see count_type */ @@ -925,6 +1122,9 @@ struct f2fs_sb_info { /* # of allocated blocks */ struct percpu_counter alloc_valid_block_count; + /* writeback control */ + atomic_t wb_sync_req; /* count # of WB_SYNC threads */ + /* valid inode count */ struct percpu_counter total_valid_inode_count; @@ -935,6 +1135,9 @@ struct f2fs_sb_info { struct f2fs_gc_kthread *gc_thread; /* GC thread */ unsigned int cur_victim_sec; /* current victim section num */ + /* threshold for converting bg victims for fg */ + u64 fggc_threshold; + /* maximum # of trials to find a victim segment for SSR and GC */ unsigned int max_victim_search; @@ -955,13 +1158,19 @@ struct f2fs_sb_info { atomic_t inline_inode; /* # of inline_data inodes */ atomic_t inline_dir; /* # of inline_dentry inodes */ atomic_t aw_cnt; /* # of atomic writes */ + atomic_t vw_cnt; /* # of volatile writes */ atomic_t max_aw_cnt; /* max # of atomic writes */ + atomic_t max_vw_cnt; /* max # of volatile writes */ int bg_gc; /* background gc calls */ unsigned int ndirty_inode[NR_INODE_TYPE]; /* # of dirty inodes */ #endif - unsigned int last_victim[2]; /* last victim segment # */ spinlock_t stat_lock; /* lock for stat operations */ + /* For app/fs IO statistics */ + spinlock_t iostat_lock; + unsigned long long write_iostat[NR_IO_TYPE]; + bool iostat_enable; + /* For sysfs suppport */ struct kobject s_kobj; struct completion s_kobj_unregister; @@ -980,13 +1189,26 @@ struct f2fs_sb_info { /* Reference to checksum algorithm driver via cryptoapi */ struct crypto_shash *s_chksum_driver; + /* Precomputed FS UUID checksum for seeding other checksums */ + __u32 s_chksum_seed; + /* For fault injection */ #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info fault_info; #endif + +#ifdef CONFIG_QUOTA + /* Names of quota files with journalled quota */ + char *s_qf_names[MAXQUOTAS]; + int s_jquota_fmt; /* Format of quota to use */ +#endif }; #ifdef CONFIG_F2FS_FAULT_INJECTION +#define f2fs_show_injection_info(type) \ + printk("%sF2FS-fs : inject %s in %s of %pF\n", \ + KERN_INFO, fault_name[type], \ + __func__, __builtin_return_address(0)) static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) { struct f2fs_fault_info *ffi = &sbi->fault_info; @@ -1000,10 +1222,6 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); - printk("%sF2FS-fs : inject %s in %pF\n", - KERN_INFO, - fault_name[type], - __builtin_return_address(0)); return true; } return false; @@ -1014,8 +1232,8 @@ static inline bool time_to_inject(struct f2fs_sb_info *sbi, int type) * and the return value is in kbytes. s is of struct f2fs_sb_info. */ #define BD_PART_WRITTEN(s) \ -(((u64)part_stat_read(s->sb->s_bdev->bd_part, sectors[1]) - \ - s->sectors_written_start) >> 1) +(((u64)part_stat_read((s)->sb->s_bdev->bd_part, sectors[1]) - \ + (s)->sectors_written_start) >> 1) static inline void f2fs_update_time(struct f2fs_sb_info *sbi, int type) { @@ -1068,6 +1286,27 @@ static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, return f2fs_crc32(sbi, buf, buf_size) == blk_crc; } +static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, + const void *address, unsigned int length) +{ + struct { + struct shash_desc shash; + char ctx[4]; + } desc; + int err; + + BUG_ON(crypto_shash_descsize(sbi->s_chksum_driver) != sizeof(desc.ctx)); + + desc.shash.tfm = sbi->s_chksum_driver; + desc.shash.flags = 0; + *(u32 *)desc.ctx = crc; + + err = crypto_shash_update(&desc.shash, address, length); + BUG_ON(err); + + return *(u32 *)desc.ctx; +} + static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) { return container_of(inode, struct f2fs_inode_info, vfs_inode); @@ -1168,6 +1407,12 @@ static inline unsigned long long cur_cp_version(struct f2fs_checkpoint *cp) return le64_to_cpu(cp->checkpoint_ver); } +static inline __u64 cur_cp_crc(struct f2fs_checkpoint *cp) +{ + size_t crc_offset = le32_to_cpu(cp->checksum_offset); + return le32_to_cpu(*((__le32 *)((unsigned char *)cp + crc_offset))); +} + static inline bool __is_set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) { unsigned int ckpt_flags = le32_to_cpu(cp->ckpt_flags); @@ -1191,9 +1436,11 @@ static inline void __set_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) static inline void set_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __set_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); } static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f) @@ -1207,9 +1454,34 @@ static inline void __clear_ckpt_flags(struct f2fs_checkpoint *cp, unsigned int f static inline void clear_ckpt_flags(struct f2fs_sb_info *sbi, unsigned int f) { - spin_lock(&sbi->cp_lock); + unsigned long flags; + + spin_lock_irqsave(&sbi->cp_lock, flags); __clear_ckpt_flags(F2FS_CKPT(sbi), f); - spin_unlock(&sbi->cp_lock); + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + + set_sbi_flag(sbi, SBI_NEED_FSCK); + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + kfree(NM_I(sbi)->nat_bits); + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? (cpc->reason & CP_UMOUNT) && set : set; } static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) @@ -1217,6 +1489,11 @@ static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) down_read(&sbi->cp_rwsem); } +static inline int f2fs_trylock_op(struct f2fs_sb_info *sbi) +{ + return down_read_trylock(&sbi->cp_rwsem); +} + static inline void f2fs_unlock_op(struct f2fs_sb_info *sbi) { up_read(&sbi->cp_rwsem); @@ -1245,7 +1522,7 @@ static inline int __get_cp_reason(struct f2fs_sb_info *sbi) static inline bool __remain_node_summaries(int reason) { - return (reason == CP_UMOUNT || reason == CP_FASTBOOT); + return (reason & (CP_UMOUNT | CP_FASTBOOT)); } static inline bool __exist_node_summaries(struct f2fs_sb_info *sbi) @@ -1266,17 +1543,14 @@ static inline int check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) return 0; } -#define F2FS_DEFAULT_ALLOCATED_BLOCKS 1 - /* * Check whether the inode has blocks or not */ static inline int F2FS_HAS_BLOCKS(struct inode *inode) { - if (F2FS_I(inode)->i_xattr_nid) - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS + 1; - else - return inode->i_blocks > F2FS_DEFAULT_ALLOCATED_BLOCKS; + block_t xattr_block = F2FS_I(inode)->i_xattr_nid ? 1 : 0; + + return (inode->i_blocks >> F2FS_LOG_SECTORS_PER_BLOCK) > xattr_block; } static inline bool f2fs_has_xattr_block(unsigned int ofs) @@ -1284,15 +1558,24 @@ static inline bool f2fs_has_xattr_block(unsigned int ofs) return ofs == XATTR_NODE_OFFSET; } -static inline void f2fs_i_blocks_write(struct inode *, blkcnt_t, bool); -static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, +static inline void f2fs_i_blocks_write(struct inode *, block_t, bool, bool); +static inline int inc_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, blkcnt_t *count) { - blkcnt_t diff; + blkcnt_t diff = 0, release = 0; + block_t avail_user_block_count; + int ret; + + ret = dquot_reserve_block(inode, *count); + if (ret) + return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_BLOCK)) - return false; + if (time_to_inject(sbi, FAULT_BLOCK)) { + f2fs_show_injection_info(FAULT_BLOCK); + release = *count; + goto enospc; + } #endif /* * let's increase this in prior to actual block count change in order @@ -1302,32 +1585,42 @@ static inline bool inc_valid_block_count(struct f2fs_sb_info *sbi, spin_lock(&sbi->stat_lock); sbi->total_valid_block_count += (block_t)(*count); - if (unlikely(sbi->total_valid_block_count > sbi->user_block_count)) { - diff = sbi->total_valid_block_count - sbi->user_block_count; + avail_user_block_count = sbi->user_block_count - sbi->reserved_blocks; + if (unlikely(sbi->total_valid_block_count > avail_user_block_count)) { + diff = sbi->total_valid_block_count - avail_user_block_count; *count -= diff; - sbi->total_valid_block_count = sbi->user_block_count; + release = diff; + sbi->total_valid_block_count = avail_user_block_count; if (!*count) { spin_unlock(&sbi->stat_lock); percpu_counter_sub(&sbi->alloc_valid_block_count, diff); - return false; + goto enospc; } } spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, *count, true); - return true; + if (release) + dquot_release_reservation_block(inode, release); + f2fs_i_blocks_write(inode, *count, true, true); + return 0; + +enospc: + dquot_release_reservation_block(inode, release); + return -ENOSPC; } static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, struct inode *inode, - blkcnt_t count) + block_t count) { + blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; + spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - f2fs_bug_on(sbi, inode->i_blocks < count); + f2fs_bug_on(sbi, inode->i_blocks < sectors); sbi->total_valid_block_count -= (block_t)count; spin_unlock(&sbi->stat_lock); - f2fs_i_blocks_write(inode, count, false); + f2fs_i_blocks_write(inode, count, false, true); } static inline void inc_page_count(struct f2fs_sb_info *sbi, int count_type) @@ -1456,51 +1749,70 @@ static inline block_t __start_sum_addr(struct f2fs_sb_info *sbi) return le32_to_cpu(F2FS_CKPT(sbi)->cp_pack_start_sum); } -static inline bool inc_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) +static inline int inc_valid_node_count(struct f2fs_sb_info *sbi, + struct inode *inode, bool is_inode) { block_t valid_block_count; unsigned int valid_node_count; + bool quota = inode && !is_inode; + + if (quota) { + int ret = dquot_reserve_block(inode, 1); + if (ret) + return ret; + } spin_lock(&sbi->stat_lock); valid_block_count = sbi->total_valid_block_count + 1; - if (unlikely(valid_block_count > sbi->user_block_count)) { + if (unlikely(valid_block_count + sbi->reserved_blocks > + sbi->user_block_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } valid_node_count = sbi->total_valid_node_count + 1; if (unlikely(valid_node_count > sbi->total_node_count)) { spin_unlock(&sbi->stat_lock); - return false; + goto enospc; } - if (inode) - f2fs_i_blocks_write(inode, 1, true); - sbi->total_valid_node_count++; sbi->total_valid_block_count++; spin_unlock(&sbi->stat_lock); + if (inode) { + if (is_inode) + f2fs_mark_inode_dirty_sync(inode, true); + else + f2fs_i_blocks_write(inode, 1, true, true); + } + percpu_counter_inc(&sbi->alloc_valid_block_count); - return true; + return 0; + +enospc: + if (quota) + dquot_release_reservation_block(inode, 1); + return -ENOSPC; } static inline void dec_valid_node_count(struct f2fs_sb_info *sbi, - struct inode *inode) + struct inode *inode, bool is_inode) { spin_lock(&sbi->stat_lock); f2fs_bug_on(sbi, !sbi->total_valid_block_count); f2fs_bug_on(sbi, !sbi->total_valid_node_count); - f2fs_bug_on(sbi, !inode->i_blocks); + f2fs_bug_on(sbi, !is_inode && !inode->i_blocks); - f2fs_i_blocks_write(inode, 1, false); sbi->total_valid_node_count--; sbi->total_valid_block_count--; spin_unlock(&sbi->stat_lock); + + if (!is_inode) + f2fs_i_blocks_write(inode, 1, false, true); } static inline unsigned int valid_node_count(struct f2fs_sb_info *sbi) @@ -1528,11 +1840,14 @@ static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, { #ifdef CONFIG_F2FS_FAULT_INJECTION struct page *page = find_lock_page(mapping, index); + if (page) return page; - if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) { + f2fs_show_injection_info(FAULT_PAGE_ALLOC); return NULL; + } #endif if (!for_write) return grab_cache_page(mapping, index); @@ -1611,22 +1926,42 @@ static inline void f2fs_radix_tree_insert(struct radix_tree_root *root, static inline bool IS_INODE(struct page *page) { struct f2fs_node *p = F2FS_NODE(page); + return RAW_IS_INODE(p); } +static inline int offset_in_addr(struct f2fs_inode *i) +{ + return (i->i_inline & F2FS_EXTRA_ATTR) ? + (le16_to_cpu(i->i_extra_isize) / sizeof(__le32)) : 0; +} + static inline __le32 *blkaddr_in_node(struct f2fs_node *node) { return RAW_IS_INODE(node) ? node->i.i_addr : node->dn.addr; } -static inline block_t datablock_addr(struct page *node_page, - unsigned int offset) +static inline int f2fs_has_extra_attr(struct inode *inode); +static inline block_t datablock_addr(struct inode *inode, + struct page *node_page, unsigned int offset) { struct f2fs_node *raw_node; __le32 *addr_array; + int base = 0; + bool is_inode = IS_INODE(node_page); + raw_node = F2FS_NODE(node_page); + + /* from GC path only */ + if (!inode) { + if (is_inode) + base = offset_in_addr(&raw_node->i); + } else if (f2fs_has_extra_attr(inode) && is_inode) { + base = get_extra_isize(inode); + } + addr_array = blkaddr_in_node(raw_node); - return le32_to_cpu(addr_array[offset]); + return le32_to_cpu(addr_array[base + offset]); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -1689,6 +2024,20 @@ static inline void f2fs_change_bit(unsigned int nr, char *addr) *addr ^= mask; } +#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) +#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) +#define F2FS_FL_INHERITED (FS_PROJINHERIT_FL) + +static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +{ + if (S_ISDIR(mode)) + return flags; + else if (S_ISREG(mode)) + return flags & F2FS_REG_FLMASK; + else + return flags & F2FS_OTHER_FLMASK; +} + /* used for f2fs_inode_info->flags */ enum { FI_NEW_INODE, /* indicate newly allocated inode */ @@ -1715,6 +2064,10 @@ enum { FI_INLINE_DOTS, /* indicate inline dot dentries */ FI_DO_DEFRAG, /* indicate defragment is running */ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */ + FI_NO_PREALLOC, /* indicate skipped preallocated blocks */ + FI_HOT_DATA, /* indicate file is hot */ + FI_EXTRA_ATTR, /* indicate file has extra attribute */ + FI_PROJ_INHERIT, /* indicate file inherits projectid */ }; static inline void __mark_inode_dirty_flag(struct inode *inode, @@ -1768,13 +2121,21 @@ static inline void f2fs_i_links_write(struct inode *inode, bool inc) } static inline void f2fs_i_blocks_write(struct inode *inode, - blkcnt_t diff, bool add) + block_t diff, bool add, bool claim) { bool clean = !is_inode_flag_set(inode, FI_DIRTY_INODE); bool recover = is_inode_flag_set(inode, FI_AUTO_RECOVER); - inode->i_blocks = add ? inode->i_blocks + diff : - inode->i_blocks - diff; + /* add = 1, claim = 1 should be dquot_reserve_block in pair */ + if (add) { + if (claim) + dquot_claim_block(inode, diff); + else + dquot_alloc_block_nofail(inode, diff); + } else { + dquot_free_block(inode, diff); + } + f2fs_mark_inode_dirty_sync(inode, true); if (clean || recover) set_inode_flag(inode, FI_AUTO_RECOVER); @@ -1826,6 +2187,8 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri) set_bit(FI_DATA_EXIST, &fi->flags); if (ri->i_inline & F2FS_INLINE_DOTS) set_bit(FI_INLINE_DOTS, &fi->flags); + if (ri->i_inline & F2FS_EXTRA_ATTR) + set_bit(FI_EXTRA_ATTR, &fi->flags); } static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) @@ -1842,6 +2205,13 @@ static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri) ri->i_inline |= F2FS_DATA_EXIST; if (is_inode_flag_set(inode, FI_INLINE_DOTS)) ri->i_inline |= F2FS_INLINE_DOTS; + if (is_inode_flag_set(inode, FI_EXTRA_ATTR)) + ri->i_inline |= F2FS_EXTRA_ATTR; +} + +static inline int f2fs_has_extra_attr(struct inode *inode) +{ + return is_inode_flag_set(inode, FI_EXTRA_ATTR); } static inline int f2fs_has_inline_xattr(struct inode *inode) @@ -1852,13 +2222,14 @@ static inline int f2fs_has_inline_xattr(struct inode *inode) static inline unsigned int addrs_per_inode(struct inode *inode) { if (f2fs_has_inline_xattr(inode)) - return DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS; - return DEF_ADDRS_PER_INODE; + return CUR_ADDRS_PER_INODE(inode) - F2FS_INLINE_XATTR_ADDRS; + return CUR_ADDRS_PER_INODE(inode); } static inline void *inline_xattr_addr(struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); + return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS]); } @@ -1876,12 +2247,6 @@ static inline int f2fs_has_inline_data(struct inode *inode) return is_inode_flag_set(inode, FI_INLINE_DATA); } -static inline void f2fs_clear_inline_inode(struct inode *inode) -{ - clear_inode_flag(inode, FI_INLINE_DATA); - clear_inode_flag(inode, FI_DATA_EXIST); -} - static inline int f2fs_exist_data(struct inode *inode) { return is_inode_flag_set(inode, FI_DATA_EXIST); @@ -1917,10 +2282,12 @@ static inline bool f2fs_is_drop_cache(struct inode *inode) return is_inode_flag_set(inode, FI_DROP_CACHE); } -static inline void *inline_data_addr(struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct page *page) { struct f2fs_inode *ri = F2FS_INODE(page); - return (void *)&(ri->i_addr[1]); + int extra_size = get_extra_isize(inode); + + return (void *)&(ri->i_addr[extra_size + DEF_INLINE_RESERVED_SIZE]); } static inline int f2fs_has_inline_dentry(struct inode *inode) @@ -2003,13 +2370,15 @@ static inline void *f2fs_kmalloc(struct f2fs_sb_info *sbi, size_t size, gfp_t flags) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_KMALLOC)) + if (time_to_inject(sbi, FAULT_KMALLOC)) { + f2fs_show_injection_info(FAULT_KMALLOC); return NULL; + } #endif return kmalloc(size, flags); } -static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) +static inline void *kvmalloc(size_t size, gfp_t flags) { void *ret; @@ -2019,7 +2388,7 @@ static inline void *f2fs_kvmalloc(size_t size, gfp_t flags) return ret; } -static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) +static inline void *kvzalloc(size_t size, gfp_t flags) { void *ret; @@ -2029,42 +2398,79 @@ static inline void *f2fs_kvzalloc(size_t size, gfp_t flags) return ret; } +static inline int get_extra_isize(struct inode *inode) +{ + return F2FS_I(inode)->i_extra_isize / sizeof(__le32); +} + #define get_inode_mode(i) \ ((is_inode_flag_set(i, FI_ACL_MODE)) ? \ (F2FS_I(i)->i_acl_mode) : ((i)->i_mode)) -/* get offset of first page in next direct node */ -#define PGOFS_OF_NEXT_DNODE(pgofs, inode) \ - ((pgofs < ADDRS_PER_INODE(inode)) ? ADDRS_PER_INODE(inode) : \ - (pgofs - ADDRS_PER_INODE(inode) + ADDRS_PER_BLOCK) / \ - ADDRS_PER_BLOCK * ADDRS_PER_BLOCK + ADDRS_PER_INODE(inode)) +#define F2FS_TOTAL_EXTRA_ATTR_SIZE \ + (offsetof(struct f2fs_inode, i_extra_end) - \ + offsetof(struct f2fs_inode, i_extra_isize)) \ + +#define F2FS_OLD_ATTRIBUTE_SIZE (offsetof(struct f2fs_inode, i_addr)) +#define F2FS_FITS_IN_INODE(f2fs_inode, extra_isize, field) \ + ((offsetof(typeof(*f2fs_inode), field) + \ + sizeof((f2fs_inode)->field)) \ + <= (F2FS_OLD_ATTRIBUTE_SIZE + extra_isize)) \ + +static inline void f2fs_reset_iostat(struct f2fs_sb_info *sbi) +{ + int i; + + spin_lock(&sbi->iostat_lock); + for (i = 0; i < NR_IO_TYPE; i++) + sbi->write_iostat[i] = 0; + spin_unlock(&sbi->iostat_lock); +} + +static inline void f2fs_update_iostat(struct f2fs_sb_info *sbi, + enum iostat_type type, unsigned long long io_bytes) +{ + if (!sbi->iostat_enable) + return; + spin_lock(&sbi->iostat_lock); + sbi->write_iostat[type] += io_bytes; + + if (type == APP_WRITE_IO || type == APP_DIRECT_IO) + sbi->write_iostat[APP_BUFFERED_IO] = + sbi->write_iostat[APP_WRITE_IO] - + sbi->write_iostat[APP_DIRECT_IO]; + spin_unlock(&sbi->iostat_lock); +} /* * file.c */ -int f2fs_sync_file(struct file *, loff_t, loff_t, int); -void truncate_data_blocks(struct dnode_of_data *); -int truncate_blocks(struct inode *, u64, bool); -int f2fs_truncate(struct inode *); -int f2fs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -int f2fs_setattr(struct dentry *, struct iattr *); -int truncate_hole(struct inode *, pgoff_t, pgoff_t); -int truncate_data_blocks_range(struct dnode_of_data *, int); -long f2fs_ioctl(struct file *, unsigned int, unsigned long); -long f2fs_compat_ioctl(struct file *, unsigned int, unsigned long); +int f2fs_sync_file(struct file *file, loff_t start, loff_t end, int datasync); +void truncate_data_blocks(struct dnode_of_data *dn); +int truncate_blocks(struct inode *inode, u64 from, bool lock); +int f2fs_truncate(struct inode *inode); +int f2fs_getattr(struct vfsmount *mnt, struct dentry *dentry, + struct kstat *stat); +int f2fs_setattr(struct dentry *dentry, struct iattr *attr); +int truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end); +int truncate_data_blocks_range(struct dnode_of_data *dn, int count); +long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg); +long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); /* * inode.c */ -void f2fs_set_inode_flags(struct inode *); -struct inode *f2fs_iget(struct super_block *, unsigned long); -struct inode *f2fs_iget_retry(struct super_block *, unsigned long); -int try_to_free_nats(struct f2fs_sb_info *, int); -int update_inode(struct inode *, struct page *); -int update_inode_page(struct inode *); -int f2fs_write_inode(struct inode *, struct writeback_control *); -void f2fs_evict_inode(struct inode *); -void handle_failed_inode(struct inode *); +void f2fs_set_inode_flags(struct inode *inode); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); +struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); +struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); +int try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); +int update_inode(struct inode *inode, struct page *node_page); +int update_inode_page(struct inode *inode); +int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); +void f2fs_evict_inode(struct inode *inode); +void handle_failed_inode(struct inode *inode); /* * namei.c @@ -2074,40 +2480,45 @@ struct dentry *f2fs_get_parent(struct dentry *child); /* * dir.c */ -void set_de_type(struct f2fs_dir_entry *, umode_t); -unsigned char get_de_type(struct f2fs_dir_entry *); -struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *, - f2fs_hash_t, int *, struct f2fs_dentry_ptr *); -int f2fs_fill_dentries(struct dir_context *, struct f2fs_dentry_ptr *, - unsigned int, struct fscrypt_str *); -void do_make_empty_dir(struct inode *, struct inode *, - struct f2fs_dentry_ptr *); -struct page *init_inode_metadata(struct inode *, struct inode *, - const struct qstr *, const struct qstr *, struct page *); -void update_parent_metadata(struct inode *, struct inode *, unsigned int); -int room_for_filename(const void *, int, int); -void f2fs_drop_nlink(struct inode *, struct inode *); -struct f2fs_dir_entry *__f2fs_find_entry(struct inode *, struct fscrypt_name *, - struct page **); -struct f2fs_dir_entry *f2fs_find_entry(struct inode *, const struct qstr *, - struct page **); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *, struct page **); -ino_t f2fs_inode_by_name(struct inode *, const struct qstr *, struct page **); -void f2fs_set_link(struct inode *, struct f2fs_dir_entry *, - struct page *, struct inode *); -int update_dent_inode(struct inode *, struct inode *, const struct qstr *); -void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *, - const struct qstr *, f2fs_hash_t , unsigned int); -int f2fs_add_regular_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -int __f2fs_do_add_link(struct inode *, struct fscrypt_name*, struct inode *, - nid_t, umode_t); -int __f2fs_add_link(struct inode *, const struct qstr *, struct inode *, nid_t, - umode_t); -void f2fs_delete_entry(struct f2fs_dir_entry *, struct page *, struct inode *, - struct inode *); -int f2fs_do_tmpfile(struct inode *, struct inode *); -bool f2fs_empty_dir(struct inode *); +void set_de_type(struct f2fs_dir_entry *de, umode_t mode); +unsigned char get_de_type(struct f2fs_dir_entry *de); +struct f2fs_dir_entry *find_target_dentry(struct fscrypt_name *fname, + f2fs_hash_t namehash, int *max_slots, + struct f2fs_dentry_ptr *d); +int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, + unsigned int start_pos, struct fscrypt_str *fstr); +void do_make_empty_dir(struct inode *inode, struct inode *parent, + struct f2fs_dentry_ptr *d); +struct page *init_inode_metadata(struct inode *inode, struct inode *dir, + const struct qstr *new_name, + const struct qstr *orig_name, struct page *dpage); +void update_parent_metadata(struct inode *dir, struct inode *inode, + unsigned int current_depth); +int room_for_filename(const void *bitmap, int slots, int max_slots); +void f2fs_drop_nlink(struct inode *dir, struct inode *inode); +struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, + const struct qstr *child, struct page **res_page); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); +ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, + struct page **page); +void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, + struct page *page, struct inode *inode); +void f2fs_update_dentry(nid_t ino, umode_t mode, struct f2fs_dentry_ptr *d, + const struct qstr *name, f2fs_hash_t name_hash, + unsigned int bit_pos); +int f2fs_add_regular_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_do_add_link(struct inode *dir, struct fscrypt_name *fname, + struct inode *inode, nid_t ino, umode_t mode); +int __f2fs_add_link(struct inode *dir, const struct qstr *name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +int f2fs_do_tmpfile(struct inode *inode, struct inode *dir); +bool f2fs_empty_dir(struct inode *dir); static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) { @@ -2118,18 +2529,21 @@ static inline int f2fs_add_link(struct dentry *dentry, struct inode *inode) /* * super.c */ -int f2fs_inode_dirtied(struct inode *, bool); -void f2fs_inode_synced(struct inode *); -int f2fs_commit_super(struct f2fs_sb_info *, bool); -int f2fs_sync_fs(struct super_block *, int); +int f2fs_inode_dirtied(struct inode *inode, bool sync); +void f2fs_inode_synced(struct inode *inode); +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi); +void f2fs_quota_off_umount(struct super_block *sb); +int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover); +int f2fs_sync_fs(struct super_block *sb, int sync); extern __printf(3, 4) -void f2fs_msg(struct super_block *, const char *, const char *, ...); +void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...); int sanity_check_ckpt(struct f2fs_sb_info *sbi); /* * hash.c */ -f2fs_hash_t f2fs_dentry_hash(const struct qstr *); +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname); /* * node.c @@ -2137,164 +2551,190 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *); struct dnode_of_data; struct node_info; -bool available_free_memory(struct f2fs_sb_info *, int); -int need_dentry_mark(struct f2fs_sb_info *, nid_t); -bool is_checkpointed_node(struct f2fs_sb_info *, nid_t); -bool need_inode_block_update(struct f2fs_sb_info *, nid_t); -void get_node_info(struct f2fs_sb_info *, nid_t, struct node_info *); -pgoff_t get_next_page_offset(struct dnode_of_data *, pgoff_t); -int get_dnode_of_data(struct dnode_of_data *, pgoff_t, int); -int truncate_inode_blocks(struct inode *, pgoff_t); -int truncate_xattr_node(struct inode *, struct page *); -int wait_on_node_pages_writeback(struct f2fs_sb_info *, nid_t); -int remove_inode_page(struct inode *); -struct page *new_inode_page(struct inode *); -struct page *new_node_page(struct dnode_of_data *, unsigned int, struct page *); -void ra_node_page(struct f2fs_sb_info *, nid_t); -struct page *get_node_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_node_page_ra(struct page *, int); -void move_node_page(struct page *, int); -int fsync_node_pages(struct f2fs_sb_info *, struct inode *, - struct writeback_control *, bool); -int sync_node_pages(struct f2fs_sb_info *, struct writeback_control *); -void build_free_nids(struct f2fs_sb_info *, bool); -bool alloc_nid(struct f2fs_sb_info *, nid_t *); -void alloc_nid_done(struct f2fs_sb_info *, nid_t); -void alloc_nid_failed(struct f2fs_sb_info *, nid_t); -int try_to_free_nids(struct f2fs_sb_info *, int); -void recover_inline_xattr(struct inode *, struct page *); -void recover_xattr_data(struct inode *, struct page *, block_t); -int recover_inode_page(struct f2fs_sb_info *, struct page *); -int restore_node_summary(struct f2fs_sb_info *, unsigned int, - struct f2fs_summary_block *); -void flush_nat_entries(struct f2fs_sb_info *); -int build_node_manager(struct f2fs_sb_info *); -void destroy_node_manager(struct f2fs_sb_info *); +bool available_free_memory(struct f2fs_sb_info *sbi, int type); +int need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); +bool is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); +bool need_inode_block_update(struct f2fs_sb_info *sbi, nid_t ino); +void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni); +pgoff_t get_next_page_offset(struct dnode_of_data *dn, pgoff_t pgofs); +int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode); +int truncate_inode_blocks(struct inode *inode, pgoff_t from); +int truncate_xattr_node(struct inode *inode, struct page *page); +int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino); +int remove_inode_page(struct inode *inode); +struct page *new_inode_page(struct inode *inode); +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs); +void ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); +struct page *get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct page *get_node_page_ra(struct page *parent, int start); +void move_node_page(struct page *node_page, int gc_type); +int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, + struct writeback_control *wbc, bool atomic); +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type); +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount); +bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); +void alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); +void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); +int try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); +void recover_inline_xattr(struct inode *inode, struct page *page); +int recover_xattr_data(struct inode *inode, struct page *page, + block_t blkaddr); +int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); +int restore_node_summary(struct f2fs_sb_info *sbi, + unsigned int segno, struct f2fs_summary_block *sum); +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_node_manager(struct f2fs_sb_info *sbi); +void destroy_node_manager(struct f2fs_sb_info *sbi); int __init create_node_manager_caches(void); void destroy_node_manager_caches(void); /* * segment.c */ -void register_inmem_page(struct inode *, struct page *); -void drop_inmem_pages(struct inode *); -int commit_inmem_pages(struct inode *); -void f2fs_balance_fs(struct f2fs_sb_info *, bool); -void f2fs_balance_fs_bg(struct f2fs_sb_info *); -int f2fs_issue_flush(struct f2fs_sb_info *); -int create_flush_cmd_control(struct f2fs_sb_info *); -void destroy_flush_cmd_control(struct f2fs_sb_info *, bool); -void invalidate_blocks(struct f2fs_sb_info *, block_t); -bool is_checkpointed_data(struct f2fs_sb_info *, block_t); -void refresh_sit_entry(struct f2fs_sb_info *, block_t, block_t); -void f2fs_wait_discard_bio(struct f2fs_sb_info *, block_t); -void clear_prefree_segments(struct f2fs_sb_info *, struct cp_control *); -void release_discard_addrs(struct f2fs_sb_info *); -int npages_for_summary_flush(struct f2fs_sb_info *, bool); -void allocate_new_segments(struct f2fs_sb_info *); -int f2fs_trim_fs(struct f2fs_sb_info *, struct fstrim_range *); -bool exist_trim_candidates(struct f2fs_sb_info *, struct cp_control *); -struct page *get_sum_page(struct f2fs_sb_info *, unsigned int); -void update_meta_page(struct f2fs_sb_info *, void *, block_t); -void write_meta_page(struct f2fs_sb_info *, struct page *); -void write_node_page(unsigned int, struct f2fs_io_info *); -void write_data_page(struct dnode_of_data *, struct f2fs_io_info *); -void rewrite_data_page(struct f2fs_io_info *); -void __f2fs_replace_block(struct f2fs_sb_info *, struct f2fs_summary *, - block_t, block_t, bool, bool); -void f2fs_replace_block(struct f2fs_sb_info *, struct dnode_of_data *, - block_t, block_t, unsigned char, bool, bool); -void allocate_data_block(struct f2fs_sb_info *, struct page *, - block_t, block_t *, struct f2fs_summary *, int); -void f2fs_wait_on_page_writeback(struct page *, enum page_type, bool); -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *, block_t); -void write_data_summaries(struct f2fs_sb_info *, block_t); -void write_node_summaries(struct f2fs_sb_info *, block_t); -int lookup_journal_in_cursum(struct f2fs_journal *, int, unsigned int, int); -void flush_sit_entries(struct f2fs_sb_info *, struct cp_control *); -int build_segment_manager(struct f2fs_sb_info *); -void destroy_segment_manager(struct f2fs_sb_info *); +bool need_SSR(struct f2fs_sb_info *sbi); +void register_inmem_page(struct inode *inode, struct page *page); +void drop_inmem_pages(struct inode *inode); +void drop_inmem_page(struct inode *inode, struct page *page); +int commit_inmem_pages(struct inode *inode); +void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need); +void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi); +int f2fs_issue_flush(struct f2fs_sb_info *sbi); +int create_flush_cmd_control(struct f2fs_sb_info *sbi); +void destroy_flush_cmd_control(struct f2fs_sb_info *sbi, bool free); +void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); +bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); +void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); +void stop_discard_thread(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void release_discard_addrs(struct f2fs_sb_info *sbi); +int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +void allocate_new_segments(struct f2fs_sb_info *sbi); +int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); +bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc); +struct page *get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +void update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type); +void write_node_page(unsigned int nid, struct f2fs_io_info *fio); +void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio); +int rewrite_data_page(struct f2fs_io_info *fio); +void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, + block_t old_blkaddr, block_t new_blkaddr, + bool recover_curseg, bool recover_newaddr); +void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, + block_t old_addr, block_t new_addr, + unsigned char version, bool recover_curseg, + bool recover_newaddr); +void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, + block_t old_blkaddr, block_t *new_blkaddr, + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list); +void f2fs_wait_on_page_writeback(struct page *page, + enum page_type type, bool ordered); +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr); +void write_data_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +void write_node_summaries(struct f2fs_sb_info *sbi, block_t start_blk); +int lookup_journal_in_cursum(struct f2fs_journal *journal, int type, + unsigned int val, int alloc); +void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); +int build_segment_manager(struct f2fs_sb_info *sbi); +void destroy_segment_manager(struct f2fs_sb_info *sbi); int __init create_segment_manager_caches(void); void destroy_segment_manager_caches(void); /* * checkpoint.c */ -void f2fs_stop_checkpoint(struct f2fs_sb_info *, bool); -struct page *grab_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_meta_page(struct f2fs_sb_info *, pgoff_t); -struct page *get_tmp_page(struct f2fs_sb_info *, pgoff_t); -bool is_valid_blkaddr(struct f2fs_sb_info *, block_t, int); -int ra_meta_pages(struct f2fs_sb_info *, block_t, int, int, bool); -void ra_meta_pages_cond(struct f2fs_sb_info *, pgoff_t); -long sync_meta_pages(struct f2fs_sb_info *, enum page_type, long); -void add_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void remove_ino_entry(struct f2fs_sb_info *, nid_t, int type); -void release_ino_entry(struct f2fs_sb_info *, bool); -bool exist_written_data(struct f2fs_sb_info *, nid_t, int); -int f2fs_sync_inode_meta(struct f2fs_sb_info *); -int acquire_orphan_inode(struct f2fs_sb_info *); -void release_orphan_inode(struct f2fs_sb_info *); -void add_orphan_inode(struct inode *); -void remove_orphan_inode(struct f2fs_sb_info *, nid_t); -int recover_orphan_inodes(struct f2fs_sb_info *); -int get_valid_checkpoint(struct f2fs_sb_info *); -void update_dirty_page(struct inode *, struct page *); -void remove_dirty_inode(struct inode *); -int sync_dirty_inodes(struct f2fs_sb_info *, enum inode_type); -int write_checkpoint(struct f2fs_sb_info *, struct cp_control *); -void init_ino_entry_info(struct f2fs_sb_info *); +void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io); +struct page *grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct page *get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +bool is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); +int ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, + int type, bool sync); +void ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index); +long sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, + long nr_to_write, enum iostat_type io_type); +void add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void remove_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, int type); +void release_ino_entry(struct f2fs_sb_info *sbi, bool all); +bool exist_written_data(struct f2fs_sb_info *sbi, nid_t ino, int mode); +int f2fs_sync_inode_meta(struct f2fs_sb_info *sbi); +int acquire_orphan_inode(struct f2fs_sb_info *sbi); +void release_orphan_inode(struct f2fs_sb_info *sbi); +void add_orphan_inode(struct inode *inode); +void remove_orphan_inode(struct f2fs_sb_info *sbi, nid_t ino); +int recover_orphan_inodes(struct f2fs_sb_info *sbi); +int get_valid_checkpoint(struct f2fs_sb_info *sbi); +void update_dirty_page(struct inode *inode, struct page *page); +void remove_dirty_inode(struct inode *inode); +int sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type); +int write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc); +void init_ino_entry_info(struct f2fs_sb_info *sbi); int __init create_checkpoint_caches(void); void destroy_checkpoint_caches(void); /* * data.c */ -void f2fs_submit_merged_bio(struct f2fs_sb_info *, enum page_type, int); -void f2fs_submit_merged_bio_cond(struct f2fs_sb_info *, struct inode *, - struct page *, nid_t, enum page_type, int); -void f2fs_flush_merged_bios(struct f2fs_sb_info *); -int f2fs_submit_page_bio(struct f2fs_io_info *); -int f2fs_submit_page_mbio(struct f2fs_io_info *); -struct block_device *f2fs_target_device(struct f2fs_sb_info *, - block_t, struct bio *); -int f2fs_target_device_index(struct f2fs_sb_info *, block_t); -void set_data_blkaddr(struct dnode_of_data *); -void f2fs_update_data_blkaddr(struct dnode_of_data *, block_t); -int reserve_new_blocks(struct dnode_of_data *, blkcnt_t); -int reserve_new_block(struct dnode_of_data *); -int f2fs_get_block(struct dnode_of_data *, pgoff_t); -int f2fs_preallocate_blocks(struct kiocb *, struct iov_iter *); -int f2fs_reserve_block(struct dnode_of_data *, pgoff_t); -struct page *get_read_data_page(struct inode *, pgoff_t, int, bool); -struct page *find_data_page(struct inode *, pgoff_t); -struct page *get_lock_data_page(struct inode *, pgoff_t, bool); -struct page *get_new_data_page(struct inode *, struct page *, pgoff_t, bool); -int do_write_data_page(struct f2fs_io_info *); -int f2fs_map_blocks(struct inode *, struct f2fs_map_blocks *, int, int); -int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *, u64, u64); -void f2fs_set_page_dirty_nobuffers(struct page *); -void f2fs_invalidate_page(struct page *, unsigned int, unsigned int); -int f2fs_release_page(struct page *, gfp_t); +void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type); +void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, + struct inode *inode, nid_t ino, pgoff_t idx, + enum page_type type); +void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); +int f2fs_submit_page_bio(struct f2fs_io_info *fio); +int f2fs_submit_page_write(struct f2fs_io_info *fio); +struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi, + block_t blk_addr, struct bio *bio); +int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr); +void set_data_blkaddr(struct dnode_of_data *dn); +void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr); +int reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); +int reserve_new_block(struct dnode_of_data *dn); +int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index); +int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from); +int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); +struct page *get_read_data_page(struct inode *inode, pgoff_t index, + int op_flags, bool for_write); +struct page *find_data_page(struct inode *inode, pgoff_t index); +struct page *get_lock_data_page(struct inode *inode, pgoff_t index, + bool for_write); +struct page *get_new_data_page(struct inode *inode, + struct page *ipage, pgoff_t index, bool new_i_size); +int do_write_data_page(struct f2fs_io_info *fio); +int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, + int create, int flag); +int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, + u64 start, u64 len); +void f2fs_set_page_dirty_nobuffers(struct page *page); +int __f2fs_write_data_pages(struct address_space *mapping, + struct writeback_control *wbc, + enum iostat_type io_type); +void f2fs_invalidate_page(struct page *page, unsigned int offset, + unsigned int length); +int f2fs_release_page(struct page *page, gfp_t wait); #ifdef CONFIG_MIGRATION -int f2fs_migrate_page(struct address_space *, struct page *, struct page *, - enum migrate_mode); +int f2fs_migrate_page(struct address_space *mapping, struct page *newpage, + struct page *page, enum migrate_mode mode); #endif /* * gc.c */ -int start_gc_thread(struct f2fs_sb_info *); -void stop_gc_thread(struct f2fs_sb_info *); -block_t start_bidx_of_node(unsigned int, struct inode *); -int f2fs_gc(struct f2fs_sb_info *, bool, bool); -void build_gc_manager(struct f2fs_sb_info *); +int start_gc_thread(struct f2fs_sb_info *sbi); +void stop_gc_thread(struct f2fs_sb_info *sbi); +block_t start_bidx_of_node(unsigned int node_ofs, struct inode *inode); +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, + unsigned int segno); +void build_gc_manager(struct f2fs_sb_info *sbi); /* * recovery.c */ -int recover_fsync_data(struct f2fs_sb_info *, bool); -bool space_for_roll_forward(struct f2fs_sb_info *); +int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only); +bool space_for_roll_forward(struct f2fs_sb_info *sbi); /* * debug.c @@ -2311,11 +2751,15 @@ struct f2fs_stat_info { int ndirty_node, ndirty_dent, ndirty_meta, ndirty_data, ndirty_imeta; int inmem_pages; unsigned int ndirty_dirs, ndirty_files, ndirty_all; - int nats, dirty_nats, sits, dirty_sits, free_nids, alloc_nids; + int nats, dirty_nats, sits, dirty_sits; + int free_nids, avail_nids, alloc_nids; int total_count, utilization; int bg_gc, nr_wb_cp_data, nr_wb_data; - int inline_xattr, inline_inode, inline_dir, orphans; - int aw_cnt, max_aw_cnt; + int nr_flushing, nr_flushed, nr_discarding, nr_discarded; + int nr_discard_cmd; + unsigned int undiscard_blks; + int inline_xattr, inline_inode, inline_dir, append, update, orphans; + int aw_cnt, max_aw_cnt, vw_cnt, max_vw_cnt; unsigned int valid_count, valid_node_count, valid_inode_count, discard_blks; unsigned int bimodal, avg_vblocks; int util_free, util_valid, util_invalid; @@ -2388,9 +2832,9 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) #define stat_inc_inplace_blocks(sbi) \ (atomic_inc(&(sbi)->inplace_count)) #define stat_inc_atomic_write(inode) \ - (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_inc(&F2FS_I_SB(inode)->aw_cnt)) #define stat_dec_atomic_write(inode) \ - (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)); + (atomic_dec(&F2FS_I_SB(inode)->aw_cnt)) #define stat_update_max_atomic_write(inode) \ do { \ int cur = atomic_read(&F2FS_I_SB(inode)->aw_cnt); \ @@ -2398,11 +2842,22 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) if (cur > max) \ atomic_set(&F2FS_I_SB(inode)->max_aw_cnt, cur); \ } while (0) +#define stat_inc_volatile_write(inode) \ + (atomic_inc(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_dec_volatile_write(inode) \ + (atomic_dec(&F2FS_I_SB(inode)->vw_cnt)) +#define stat_update_max_volatile_write(inode) \ + do { \ + int cur = atomic_read(&F2FS_I_SB(inode)->vw_cnt); \ + int max = atomic_read(&F2FS_I_SB(inode)->max_vw_cnt); \ + if (cur > max) \ + atomic_set(&F2FS_I_SB(inode)->max_vw_cnt, cur); \ + } while (0) #define stat_inc_seg_count(sbi, type, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ - (si)->tot_segs++; \ - if (type == SUM_TYPE_DATA) { \ + si->tot_segs++; \ + if ((type) == SUM_TYPE_DATA) { \ si->data_segs++; \ si->bg_data_segs += (gc_type == BG_GC) ? 1 : 0; \ } else { \ @@ -2412,14 +2867,14 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) } while (0) #define stat_inc_tot_blk_count(si, blks) \ - (si->tot_blks += (blks)) + ((si)->tot_blks += (blks)) #define stat_inc_data_blk_count(sbi, blks, gc_type) \ do { \ struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->data_blks += (blks); \ - si->bg_data_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_data_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) #define stat_inc_node_blk_count(sbi, blks, gc_type) \ @@ -2427,40 +2882,43 @@ static inline struct f2fs_stat_info *F2FS_STAT(struct f2fs_sb_info *sbi) struct f2fs_stat_info *si = F2FS_STAT(sbi); \ stat_inc_tot_blk_count(si, blks); \ si->node_blks += (blks); \ - si->bg_node_blks += (gc_type == BG_GC) ? (blks) : 0; \ + si->bg_node_blks += ((gc_type) == BG_GC) ? (blks) : 0; \ } while (0) -int f2fs_build_stats(struct f2fs_sb_info *); -void f2fs_destroy_stats(struct f2fs_sb_info *); +int f2fs_build_stats(struct f2fs_sb_info *sbi); +void f2fs_destroy_stats(struct f2fs_sb_info *sbi); int __init f2fs_create_root_stats(void); void f2fs_destroy_root_stats(void); #else -#define stat_inc_cp_count(si) -#define stat_inc_bg_cp_count(si) -#define stat_inc_call_count(si) -#define stat_inc_bggc_count(si) -#define stat_inc_dirty_inode(sbi, type) -#define stat_dec_dirty_inode(sbi, type) -#define stat_inc_total_hit(sb) -#define stat_inc_rbtree_node_hit(sb) -#define stat_inc_largest_node_hit(sbi) -#define stat_inc_cached_node_hit(sbi) -#define stat_inc_inline_xattr(inode) -#define stat_dec_inline_xattr(inode) -#define stat_inc_inline_inode(inode) -#define stat_dec_inline_inode(inode) -#define stat_inc_inline_dir(inode) -#define stat_dec_inline_dir(inode) -#define stat_inc_atomic_write(inode) -#define stat_dec_atomic_write(inode) -#define stat_update_max_atomic_write(inode) -#define stat_inc_seg_type(sbi, curseg) -#define stat_inc_block_count(sbi, curseg) -#define stat_inc_inplace_blocks(sbi) -#define stat_inc_seg_count(sbi, type, gc_type) -#define stat_inc_tot_blk_count(si, blks) -#define stat_inc_data_blk_count(sbi, blks, gc_type) -#define stat_inc_node_blk_count(sbi, blks, gc_type) +#define stat_inc_cp_count(si) do { } while (0) +#define stat_inc_bg_cp_count(si) do { } while (0) +#define stat_inc_call_count(si) do { } while (0) +#define stat_inc_bggc_count(si) do { } while (0) +#define stat_inc_dirty_inode(sbi, type) do { } while (0) +#define stat_dec_dirty_inode(sbi, type) do { } while (0) +#define stat_inc_total_hit(sb) do { } while (0) +#define stat_inc_rbtree_node_hit(sb) do { } while (0) +#define stat_inc_largest_node_hit(sbi) do { } while (0) +#define stat_inc_cached_node_hit(sbi) do { } while (0) +#define stat_inc_inline_xattr(inode) do { } while (0) +#define stat_dec_inline_xattr(inode) do { } while (0) +#define stat_inc_inline_inode(inode) do { } while (0) +#define stat_dec_inline_inode(inode) do { } while (0) +#define stat_inc_inline_dir(inode) do { } while (0) +#define stat_dec_inline_dir(inode) do { } while (0) +#define stat_inc_atomic_write(inode) do { } while (0) +#define stat_dec_atomic_write(inode) do { } while (0) +#define stat_update_max_atomic_write(inode) do { } while (0) +#define stat_inc_volatile_write(inode) do { } while (0) +#define stat_dec_volatile_write(inode) do { } while (0) +#define stat_update_max_volatile_write(inode) do { } while (0) +#define stat_inc_seg_type(sbi, curseg) do { } while (0) +#define stat_inc_block_count(sbi, curseg) do { } while (0) +#define stat_inc_inplace_blocks(sbi) do { } while (0) +#define stat_inc_seg_count(sbi, type, gc_type) do { } while (0) +#define stat_inc_tot_blk_count(si, blks) do { } while (0) +#define stat_inc_data_blk_count(sbi, blks, gc_type) do { } while (0) +#define stat_inc_node_blk_count(sbi, blks, gc_type) do { } while (0) static inline int f2fs_build_stats(struct f2fs_sb_info *sbi) { return 0; } static inline void f2fs_destroy_stats(struct f2fs_sb_info *sbi) { } @@ -2483,52 +2941,78 @@ extern struct kmem_cache *inode_entry_slab; /* * inline.c */ -bool f2fs_may_inline_data(struct inode *); -bool f2fs_may_inline_dentry(struct inode *); -void read_inline_data(struct page *, struct page *); -bool truncate_inline_inode(struct page *, u64); -int f2fs_read_inline_data(struct inode *, struct page *); -int f2fs_convert_inline_page(struct dnode_of_data *, struct page *); -int f2fs_convert_inline_inode(struct inode *); -int f2fs_write_inline_data(struct inode *, struct page *); -bool recover_inline_data(struct inode *, struct page *); -struct f2fs_dir_entry *find_in_inline_dir(struct inode *, - struct fscrypt_name *, struct page **); -int make_empty_inline_dir(struct inode *inode, struct inode *, struct page *); -int f2fs_add_inline_entry(struct inode *, const struct qstr *, - const struct qstr *, struct inode *, nid_t, umode_t); -void f2fs_delete_inline_entry(struct f2fs_dir_entry *, struct page *, - struct inode *, struct inode *); -bool f2fs_empty_inline_dir(struct inode *); -int f2fs_read_inline_dir(struct file *, struct dir_context *, - struct fscrypt_str *); -int f2fs_inline_data_fiemap(struct inode *, - struct fiemap_extent_info *, __u64, __u64); +bool f2fs_may_inline_data(struct inode *inode); +bool f2fs_may_inline_dentry(struct inode *inode); +void read_inline_data(struct page *page, struct page *ipage); +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from); +int f2fs_read_inline_data(struct inode *inode, struct page *page); +int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_inode(struct inode *inode); +int f2fs_write_inline_data(struct inode *inode, struct page *page); +bool recover_inline_data(struct inode *inode, struct page *npage); +struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, + struct fscrypt_name *fname, struct page **res_page); +int make_empty_inline_dir(struct inode *inode, struct inode *parent, + struct page *ipage); +int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, + const struct qstr *orig_name, + struct inode *inode, nid_t ino, umode_t mode); +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, + struct inode *dir, struct inode *inode); +bool f2fs_empty_inline_dir(struct inode *dir); +int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, + struct fscrypt_str *fstr); +int f2fs_inline_data_fiemap(struct inode *inode, + struct fiemap_extent_info *fieinfo, + __u64 start, __u64 len); /* * shrinker.c */ -unsigned long f2fs_shrink_count(struct shrinker *, struct shrink_control *); -unsigned long f2fs_shrink_scan(struct shrinker *, struct shrink_control *); -void f2fs_join_shrinker(struct f2fs_sb_info *); -void f2fs_leave_shrinker(struct f2fs_sb_info *); +unsigned long f2fs_shrink_count(struct shrinker *shrink, + struct shrink_control *sc); +unsigned long f2fs_shrink_scan(struct shrinker *shrink, + struct shrink_control *sc); +void f2fs_join_shrinker(struct f2fs_sb_info *sbi); +void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); /* * extent_cache.c */ -unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *, int); -bool f2fs_init_extent_tree(struct inode *, struct f2fs_extent *); -void f2fs_drop_extent_tree(struct inode *); -unsigned int f2fs_destroy_extent_node(struct inode *); -void f2fs_destroy_extent_tree(struct inode *); -bool f2fs_lookup_extent_cache(struct inode *, pgoff_t, struct extent_info *); -void f2fs_update_extent_cache(struct dnode_of_data *); +struct rb_entry *__lookup_rb_tree(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs); +struct rb_node **__lookup_rb_tree_for_insert(struct f2fs_sb_info *sbi, + struct rb_root *root, struct rb_node **parent, + unsigned int ofs); +struct rb_entry *__lookup_rb_tree_ret(struct rb_root *root, + struct rb_entry *cached_re, unsigned int ofs, + struct rb_entry **prev_entry, struct rb_entry **next_entry, + struct rb_node ***insert_p, struct rb_node **insert_parent, + bool force); +bool __check_rb_tree_consistence(struct f2fs_sb_info *sbi, + struct rb_root *root); +unsigned int f2fs_shrink_extent_tree(struct f2fs_sb_info *sbi, int nr_shrink); +bool f2fs_init_extent_tree(struct inode *inode, struct f2fs_extent *i_ext); +void f2fs_drop_extent_tree(struct inode *inode); +unsigned int f2fs_destroy_extent_node(struct inode *inode); +void f2fs_destroy_extent_tree(struct inode *inode); +bool f2fs_lookup_extent_cache(struct inode *inode, pgoff_t pgofs, + struct extent_info *ei); +void f2fs_update_extent_cache(struct dnode_of_data *dn); void f2fs_update_extent_cache_range(struct dnode_of_data *dn, - pgoff_t, block_t, unsigned int); -void init_extent_cache_info(struct f2fs_sb_info *); + pgoff_t fofs, block_t blkaddr, unsigned int len); +void init_extent_cache_info(struct f2fs_sb_info *sbi); int __init create_extent_cache(void); void destroy_extent_cache(void); +/* + * sysfs.c + */ +int __init f2fs_init_sysfs(void); +void f2fs_exit_sysfs(void); +int f2fs_register_sysfs(struct f2fs_sb_info *sbi); +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi); + /* * crypto support */ @@ -2537,6 +3021,11 @@ static inline bool f2fs_encrypted_inode(struct inode *inode) return file_is_encrypt(inode); } +static inline bool f2fs_encrypted_file(struct inode *inode) +{ + return f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode); +} + static inline void f2fs_set_encrypted_inode(struct inode *inode) { #ifdef CONFIG_F2FS_FS_ENCRYPTION @@ -2559,6 +3048,21 @@ static inline int f2fs_sb_mounted_blkzoned(struct super_block *sb) return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_BLKZONED); } +static inline int f2fs_sb_has_extra_attr(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_EXTRA_ATTR); +} + +static inline int f2fs_sb_has_project_quota(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_PRJQUOTA); +} + +static inline int f2fs_sb_has_inode_chksum(struct super_block *sb) +{ + return F2FS_HAS_FEATURE(sb, F2FS_FEATURE_INODE_CHKSUM); +} + #ifdef CONFIG_BLK_DEV_ZONED static inline int get_blkz_type(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkaddr) @@ -2606,28 +3110,4 @@ static inline bool f2fs_may_encrypt(struct inode *inode) #endif } -#ifndef CONFIG_F2FS_FS_ENCRYPTION -#define fscrypt_set_d_op(i) -#define fscrypt_get_ctx fscrypt_notsupp_get_ctx -#define fscrypt_release_ctx fscrypt_notsupp_release_ctx -#define fscrypt_encrypt_page fscrypt_notsupp_encrypt_page -#define fscrypt_decrypt_page fscrypt_notsupp_decrypt_page -#define fscrypt_decrypt_bio_pages fscrypt_notsupp_decrypt_bio_pages -#define fscrypt_pullback_bio_page fscrypt_notsupp_pullback_bio_page -#define fscrypt_restore_control_page fscrypt_notsupp_restore_control_page -#define fscrypt_zeroout_range fscrypt_notsupp_zeroout_range -#define fscrypt_process_policy fscrypt_notsupp_process_policy -#define fscrypt_get_policy fscrypt_notsupp_get_policy -#define fscrypt_has_permitted_context fscrypt_notsupp_has_permitted_context -#define fscrypt_inherit_context fscrypt_notsupp_inherit_context -#define fscrypt_get_encryption_info fscrypt_notsupp_get_encryption_info -#define fscrypt_put_encryption_info fscrypt_notsupp_put_encryption_info -#define fscrypt_setup_filename fscrypt_notsupp_setup_filename -#define fscrypt_free_filename fscrypt_notsupp_free_filename -#define fscrypt_fname_encrypted_size fscrypt_notsupp_fname_encrypted_size -#define fscrypt_fname_alloc_buffer fscrypt_notsupp_fname_alloc_buffer -#define fscrypt_fname_free_buffer fscrypt_notsupp_fname_free_buffer -#define fscrypt_fname_disk_to_usr fscrypt_notsupp_fname_disk_to_usr -#define fscrypt_fname_usr_to_disk fscrypt_notsupp_fname_usr_to_disk -#endif #endif diff --git a/fs/f2fs/f2fs_crypto.h b/fs/f2fs/f2fs_crypto.h deleted file mode 100644 index f113f1a1e8c1..000000000000 --- a/fs/f2fs/f2fs_crypto.h +++ /dev/null @@ -1,150 +0,0 @@ -/* - * linux/fs/f2fs/f2fs_crypto.h - * - * Copied from linux/fs/ext4/ext4_crypto.h - * - * Copyright (C) 2015, Google, Inc. - * - * This contains encryption header content for f2fs - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ -#ifndef _F2FS_CRYPTO_H -#define _F2FS_CRYPTO_H - -#include - -#define F2FS_KEY_DESCRIPTOR_SIZE 8 - -/* Policy provided via an ioctl on the topmost directory */ -struct f2fs_encryption_policy { - char version; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; -} __attribute__((__packed__)); - -#define F2FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 -#define F2FS_KEY_DERIVATION_NONCE_SIZE 16 - -#define F2FS_POLICY_FLAGS_PAD_4 0x00 -#define F2FS_POLICY_FLAGS_PAD_8 0x01 -#define F2FS_POLICY_FLAGS_PAD_16 0x02 -#define F2FS_POLICY_FLAGS_PAD_32 0x03 -#define F2FS_POLICY_FLAGS_PAD_MASK 0x03 -#define F2FS_POLICY_FLAGS_VALID 0x03 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct f2fs_encryption_context { - char format; - char contents_encryption_mode; - char filenames_encryption_mode; - char flags; - char master_key_descriptor[F2FS_KEY_DESCRIPTOR_SIZE]; - char nonce[F2FS_KEY_DERIVATION_NONCE_SIZE]; -} __attribute__((__packed__)); - -/* Encryption parameters */ -#define F2FS_XTS_TWEAK_SIZE 16 -#define F2FS_AES_128_ECB_KEY_SIZE 16 -#define F2FS_AES_256_GCM_KEY_SIZE 32 -#define F2FS_AES_256_CBC_KEY_SIZE 32 -#define F2FS_AES_256_CTS_KEY_SIZE 32 -#define F2FS_AES_256_XTS_KEY_SIZE 64 -#define F2FS_MAX_KEY_SIZE 64 - -#define F2FS_KEY_DESC_PREFIX "f2fs:" -#define F2FS_KEY_DESC_PREFIX_SIZE 5 - -struct f2fs_encryption_key { - __u32 mode; - char raw[F2FS_MAX_KEY_SIZE]; - __u32 size; -} __attribute__((__packed__)); - -struct f2fs_crypt_info { - char ci_data_mode; - char ci_filename_mode; - char ci_flags; - struct crypto_ablkcipher *ci_ctfm; - char ci_master_key[F2FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define F2FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define F2FS_WRITE_PATH_FL 0x00000002 - -struct f2fs_crypto_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - char flags; /* Flags */ -}; - -struct f2fs_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_F2FS_COMPLETION_RESULT(ecr) \ - struct f2fs_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -static inline int f2fs_encryption_key_size(int mode) -{ - switch (mode) { - case F2FS_ENCRYPTION_MODE_AES_256_XTS: - return F2FS_AES_256_XTS_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_GCM: - return F2FS_AES_256_GCM_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CBC: - return F2FS_AES_256_CBC_KEY_SIZE; - case F2FS_ENCRYPTION_MODE_AES_256_CTS: - return F2FS_AES_256_CTS_KEY_SIZE; - default: - BUG(); - } - return 0; -} - -#define F2FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define F2FS_CRYPTO_BLOCK_SIZE 16 -#define F2FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct f2fs_encrypted_symlink_data { - __le16 len; - char encrypted_path[1]; -} __attribute__((__packed__)); - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 encrypted_symlink_data_len(u32 l) -{ - return (l + sizeof(struct f2fs_encrypted_symlink_data) - 1); -} -#endif /* _F2FS_CRYPTO_H */ diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index e4e5d76d80b0..531379f513fa 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -33,6 +34,19 @@ #include "trace.h" #include +static int f2fs_filemap_fault(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&F2FS_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&F2FS_I(inode)->i_mmap_sem); + + return err; +} + static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -60,13 +74,14 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, f2fs_balance_fs(sbi, dn.node_changed); file_update_time(vma->vm_file); + down_read(&F2FS_I(inode)->i_mmap_sem); lock_page(page); if (unlikely(page->mapping != inode->i_mapping || page_offset(page) > i_size_read(inode) || !PageUptodate(page))) { unlock_page(page); err = -EFAULT; - goto out; + goto out_sem; } /* @@ -86,15 +101,19 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, if (!PageUptodate(page)) SetPageUptodate(page); + f2fs_update_iostat(sbi, APP_MAPPED_IO, F2FS_BLKSIZE); + trace_f2fs_vm_page_mkwrite(page, DATA); mapped: /* fill the page */ f2fs_wait_on_page_writeback(page, DATA, false); /* wait for GCed encrypted page writeback */ - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - f2fs_wait_on_encrypted_page_writeback(sbi, dn.data_blkaddr); + if (f2fs_encrypted_file(inode)) + f2fs_wait_on_block_writeback(sbi, dn.data_blkaddr); +out_sem: + up_read(&F2FS_I(inode)->i_mmap_sem); out: sb_end_pagefault(inode->i_sb); f2fs_update_time(sbi, REQ_TIME); @@ -102,7 +121,7 @@ static int f2fs_vm_page_mkwrite(struct vm_area_struct *vma, } static const struct vm_operations_struct f2fs_file_vm_ops = { - .fault = filemap_fault, + .fault = f2fs_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = f2fs_vm_page_mkwrite, }; @@ -117,11 +136,6 @@ static int get_parent_ino(struct inode *inode, nid_t *pino) if (!dentry) return 0; - if (update_dent_inode(inode, inode, &dentry->d_name)) { - dput(dentry); - return 0; - } - *pino = parent_ino(dentry); dput(dentry); return 1; @@ -142,8 +156,6 @@ static inline bool need_do_checkpoint(struct inode *inode) need_cp = true; else if (!is_checkpointed_node(sbi, F2FS_I(inode)->i_pino)) need_cp = true; - else if (F2FS_I(inode)->xattr_ver == cur_cp_version(F2FS_CKPT(sbi))) - need_cp = true; else if (test_opt(sbi, FASTBOOT)) need_cp = true; else if (sbi->active_logs == 2) @@ -169,7 +181,6 @@ static void try_to_fix_pino(struct inode *inode) nid_t pino; down_write(&fi->i_sem); - fi->xattr_ver = 0; if (file_wrong_pino(inode) && inode->i_nlink == 1 && get_parent_ino(inode, &pino)) { f2fs_i_pino_write(inode, pino); @@ -268,9 +279,19 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, goto sync_nodes; } - ret = wait_on_node_pages_writeback(sbi, ino); - if (ret) - goto out; + /* + * If it's atomic_write, it's just fine to keep write ordering. So + * here we don't need to wait for node write completion, since we use + * node chain which serializes node blocks. If one of node writes are + * reordered, we can see simply broken chain, resulting in stopping + * roll-forward recovery. It means we'll recover all or none node blocks + * given fsync mark. + */ + if (!atomic) { + ret = wait_on_node_pages_writeback(sbi, ino); + if (ret) + goto out; + } /* once recovery info is written, don't need to tack this */ remove_ino_entry(sbi, ino, APPEND_INO); @@ -278,7 +299,8 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, flush_out: remove_ino_entry(sbi, ino, UPDATE_INO); clear_inode_flag(inode, FI_UPDATE_WRITE); - ret = f2fs_issue_flush(sbi); + if (!atomic) + ret = f2fs_issue_flush(sbi); f2fs_update_time(sbi, REQ_TIME); out: trace_f2fs_sync_file_exit(inode, need_cp, datasync, ret); @@ -375,7 +397,8 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) dn.ofs_in_node++, pgofs++, data_ofs = (loff_t)pgofs << PAGE_SHIFT) { block_t blkaddr; - blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (__found_offset(blkaddr, dirty, pgofs, whence)) { f2fs_put_dnode(&dn); @@ -423,14 +446,6 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) struct inode *inode = file_inode(file); int err; - if (f2fs_encrypted_inode(inode)) { - err = fscrypt_get_encryption_info(inode); - if (err) - return 0; - if (!f2fs_encrypted_inode(inode)) - return -ENOKEY; - } - /* we don't need to use inline_data strictly */ err = f2fs_convert_inline_inode(inode); if (err) @@ -443,11 +458,10 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int f2fs_file_open(struct inode *inode, struct file *filp) { - int ret = generic_file_open(inode, filp); struct dentry *dir; - if (!ret && f2fs_encrypted_inode(inode)) { - ret = fscrypt_get_encryption_info(inode); + if (f2fs_encrypted_inode(inode)) { + int ret = fscrypt_get_encryption_info(inode); if (ret) return -EACCES; if (!fscrypt_has_encryption_key(inode)) @@ -460,7 +474,7 @@ static int f2fs_file_open(struct inode *inode, struct file *filp) return -EPERM; } dput(dir); - return ret; + return dquot_file_open(inode, filp); } int truncate_data_blocks_range(struct dnode_of_data *dn, int count) @@ -469,9 +483,13 @@ int truncate_data_blocks_range(struct dnode_of_data *dn, int count) struct f2fs_node *raw_node; int nr_free = 0, ofs = dn->ofs_in_node, len = count; __le32 *addr; + int base = 0; + + if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode)) + base = get_extra_isize(dn->inode); raw_node = F2FS_NODE(dn->node_page); - addr = blkaddr_in_node(raw_node) + ofs; + addr = blkaddr_in_node(raw_node) + base + ofs; for (; count > 0; count--, addr++, dn->ofs_in_node++) { block_t blkaddr = le32_to_cpu(*addr); @@ -531,12 +549,14 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, page = get_lock_data_page(inode, index, true); if (IS_ERR(page)) - return 0; + return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); truncate_out: f2fs_wait_on_page_writeback(page, DATA, true); zero_user(page, offset, PAGE_SIZE - offset); - if (!cache_only || !f2fs_encrypted_inode(inode) || - !S_ISREG(inode->i_mode)) + + /* An encrypted inode should have a key and truncate the last page. */ + f2fs_bug_on(F2FS_I_SB(inode), cache_only && f2fs_encrypted_inode(inode)); + if (!cache_only) set_page_dirty(page); f2fs_put_page(page, 1); return 0; @@ -569,10 +589,7 @@ int truncate_blocks(struct inode *inode, u64 from, bool lock) } if (f2fs_has_inline_data(inode)) { - if (truncate_inline_inode(ipage, from)) - set_page_dirty(ipage); - if (from == 0) - clear_inode_flag(inode, FI_DATA_EXIST); + truncate_inline_inode(inode, ipage, from); f2fs_put_page(ipage, 1); truncate_page = true; goto out; @@ -621,6 +638,12 @@ int f2fs_truncate(struct inode *inode) trace_f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(F2FS_I_SB(inode), FAULT_TRUNCATE)) { + f2fs_show_injection_info(FAULT_TRUNCATE); + return -EIO; + } +#endif /* we should check inline_data size */ if (!f2fs_may_inline_data(inode)) { err = f2fs_convert_inline_inode(inode); @@ -642,7 +665,6 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); - stat->blocks <<= 3; return 0; } @@ -686,14 +708,34 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) if (err) return err; + if (is_quota_modification(inode, attr)) { + err = dquot_initialize(inode); + if (err) + return err; + } + if ((attr->ia_valid & ATTR_UID && + !uid_eq(attr->ia_uid, inode->i_uid)) || + (attr->ia_valid & ATTR_GID && + !gid_eq(attr->ia_gid, inode->i_gid))) { + err = dquot_transfer(inode, attr); + if (err) + return err; + } + if (attr->ia_valid & ATTR_SIZE) { - if (f2fs_encrypted_inode(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; + if (f2fs_encrypted_inode(inode)) { + err = fscrypt_get_encryption_info(inode); + if (err) + return err; + if (!fscrypt_has_encryption_key(inode)) + return -ENOKEY; + } if (attr->ia_size <= i_size_read(inode)) { + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); err = f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); if (err) return err; } else { @@ -701,7 +743,9 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr) * do not trim all blocks after i_size if target size is * larger than i_size. */ + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_setsize(inode, attr->ia_size); + up_write(&F2FS_I(inode)->i_mmap_sem); /* should convert inline inode here */ if (!f2fs_may_inline_data(inode)) { @@ -847,12 +891,14 @@ static int punch_hole(struct inode *inode, loff_t offset, loff_t len) blk_start = (loff_t)pg_start << PAGE_SHIFT; blk_end = (loff_t)pg_end << PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); truncate_inode_pages_range(mapping, blk_start, blk_end - 1); f2fs_lock_op(sbi); ret = truncate_hole(inode, pg_start, pg_end); f2fs_unlock_op(sbi); + up_write(&F2FS_I(inode)->i_mmap_sem); } } @@ -883,7 +929,8 @@ static int __read_out_blkaddrs(struct inode *inode, block_t *blkaddr, done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { - *blkaddr = datablock_addr(dn.node_page, dn.ofs_in_node); + *blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); if (!is_checkpointed_data(sbi, *blkaddr)) { if (test_opt(sbi, LFS)) { @@ -959,15 +1006,15 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, ADDRS_PER_PAGE(dn.node_page, dst_inode) - dn.ofs_in_node, len - i); do { - dn.data_blkaddr = datablock_addr(dn.node_page, - dn.ofs_in_node); + dn.data_blkaddr = datablock_addr(dn.inode, + dn.node_page, dn.ofs_in_node); truncate_data_blocks_range(&dn, 1); if (do_replace[i]) { f2fs_i_blocks_write(src_inode, - 1, false); + 1, false, false); f2fs_i_blocks_write(dst_inode, - 1, true); + 1, true, false); f2fs_replace_block(sbi, &dn, dn.data_blkaddr, blkaddr[i], ni.version, true, false); @@ -1019,11 +1066,11 @@ static int __exchange_data_block(struct inode *src_inode, while (len) { olen = min((pgoff_t)4 * ADDRS_PER_BLOCK, len); - src_blkaddr = f2fs_kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); + src_blkaddr = kvzalloc(sizeof(block_t) * olen, GFP_KERNEL); if (!src_blkaddr) return -ENOMEM; - do_replace = f2fs_kvzalloc(sizeof(int) * olen, GFP_KERNEL); + do_replace = kvzalloc(sizeof(int) * olen, GFP_KERNEL); if (!do_replace) { kvfree(src_blkaddr); return -ENOMEM; @@ -1091,16 +1138,17 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) pg_start = offset >> PAGE_SHIFT; pg_end = (offset + len) >> PAGE_SHIFT; + down_write(&F2FS_I(inode)->i_mmap_sem); /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); ret = f2fs_do_collapse(inode, pg_start, pg_end); if (ret) - return ret; + goto out; /* write out all moved pages, if possible */ filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); @@ -1113,6 +1161,8 @@ static int f2fs_collapse_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1126,7 +1176,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, int ret; for (; index < end; index++, dn->ofs_in_node++) { - if (datablock_addr(dn->node_page, dn->ofs_in_node) == NULL_ADDR) + if (datablock_addr(dn->inode, dn->node_page, + dn->ofs_in_node) == NULL_ADDR) count++; } @@ -1137,8 +1188,8 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start, dn->ofs_in_node = ofs_in_node; for (index = start; index < end; index++, dn->ofs_in_node++) { - dn->data_blkaddr = - datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); /* * reserve_new_blocks will not guarantee entire block * allocation. @@ -1177,9 +1228,10 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, if (ret) return ret; + down_write(&F2FS_I(inode)->i_mmap_sem); ret = filemap_write_and_wait_range(mapping, offset, offset + len - 1); if (ret) - return ret; + goto out_sem; truncate_pagecache_range(inode, offset, offset + len - 1); @@ -1193,17 +1245,15 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, ret = fill_zero(inode, pg_start, off_start, off_end - off_start); if (ret) - return ret; + goto out_sem; - if (offset + len > new_size) - new_size = offset + len; new_size = max_t(loff_t, new_size, offset + len); } else { if (off_start) { ret = fill_zero(inode, pg_start++, off_start, PAGE_SIZE - off_start); if (ret) - return ret; + goto out_sem; new_size = max_t(loff_t, new_size, (loff_t)pg_start << PAGE_SHIFT); @@ -1252,6 +1302,8 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, out: if (!(mode & FALLOC_FL_KEEP_SIZE) && i_size_read(inode) < new_size) f2fs_i_size_write(inode, new_size); +out_sem: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1264,8 +1316,9 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) int ret = 0; new_size = i_size_read(inode) + len; - if (new_size > inode->i_sb->s_maxbytes) - return -EFBIG; + ret = inode_newsize_ok(inode, new_size); + if (ret) + return ret; if (offset >= i_size_read(inode)) return -EINVAL; @@ -1280,14 +1333,15 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) f2fs_balance_fs(sbi, true); + down_write(&F2FS_I(inode)->i_mmap_sem); ret = truncate_blocks(inode, i_size_read(inode), true); if (ret) - return ret; + goto out; /* write out all dirty pages from offset */ ret = filemap_write_and_wait_range(inode->i_mapping, offset, LLONG_MAX); if (ret) - return ret; + goto out; truncate_pagecache(inode, offset); @@ -1316,6 +1370,8 @@ static int f2fs_insert_range(struct inode *inode, loff_t offset, loff_t len) if (!ret) f2fs_i_size_write(inode, new_size); +out: + up_write(&F2FS_I(inode)->i_mmap_sem); return ret; } @@ -1435,6 +1491,7 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); set_inode_flag(inode, FI_DROP_CACHE); filemap_fdatawrite(inode->i_mapping); clear_inode_flag(inode, FI_DROP_CACHE); @@ -1442,17 +1499,20 @@ static int f2fs_release_file(struct inode *inode, struct file *filp) return 0; } -#define F2FS_REG_FLMASK (~(FS_DIRSYNC_FL | FS_TOPDIR_FL)) -#define F2FS_OTHER_FLMASK (FS_NODUMP_FL | FS_NOATIME_FL) - -static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags) +static int f2fs_file_flush(struct file *file, fl_owner_t id) { - if (S_ISDIR(mode)) - return flags; - else if (S_ISREG(mode)) - return flags & F2FS_REG_FLMASK; - else - return flags & F2FS_OTHER_FLMASK; + struct inode *inode = file_inode(file); + + /* + * If the process doing a transaction is crashed, we should do + * roll-back. Otherwise, other reader/write can see corrupted database + * until all the writers close its file. Since this should be done + * before dropping file lock, it needs to do in ->flush. + */ + if (f2fs_is_atomic_file(inode) && + F2FS_I(inode)->inmem_task == current) + drop_inmem_pages(inode); + return 0; } static int f2fs_ioc_getflags(struct file *filp, unsigned long arg) @@ -1481,28 +1541,34 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) if (ret) return ret; - flags = f2fs_mask_flags(inode->i_mode, flags); - inode_lock(inode); + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) { + ret = -EPERM; + goto unlock_out; + } + + flags = f2fs_mask_flags(inode->i_mode, flags); + oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - inode_unlock(inode); ret = -EPERM; - goto out; + goto unlock_out; } } flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - inode_unlock(inode); inode->i_ctime = current_time(inode); f2fs_set_inode_flags(inode); -out: + f2fs_mark_inode_dirty_sync(inode, false); +unlock_out: + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } @@ -1522,6 +1588,9 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1536,20 +1605,27 @@ static int f2fs_ioc_start_atomic_write(struct file *filp) goto out; set_inode_flag(inode, FI_ATOMIC_FILE); + set_inode_flag(inode, FI_HOT_DATA); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); if (!get_dirty_pages(inode)) - goto out; + goto inc_stat; f2fs_msg(F2FS_I_SB(inode)->sb, KERN_WARNING, "Unexpected flush for atomic writes: ino=%lu, npages=%u", inode->i_ino, get_dirty_pages(inode)); ret = filemap_write_and_wait_range(inode->i_mapping, 0, LLONG_MAX); - if (ret) + if (ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); -out: + clear_inode_flag(inode, FI_HOT_DATA); + goto out; + } + +inc_stat: + F2FS_I(inode)->inmem_task = current; stat_inc_atomic_write(inode); stat_update_max_atomic_write(inode); +out: inode_unlock(inode); mnt_drop_write_file(filp); return ret; @@ -1580,10 +1656,11 @@ static int f2fs_ioc_commit_atomic_write(struct file *filp) ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); if (!ret) { clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } } else { - ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); + ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 1, false); } err_out: inode_unlock(inode); @@ -1599,6 +1676,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (!inode_owner_or_capable(inode)) return -EACCES; + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + ret = mnt_want_write_file(filp); if (ret) return ret; @@ -1612,6 +1692,9 @@ static int f2fs_ioc_start_volatile_write(struct file *filp) if (ret) goto out; + stat_inc_volatile_write(inode); + stat_update_max_volatile_write(inode); + set_inode_flag(inode, FI_VOLATILE_FILE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); out: @@ -1667,6 +1750,7 @@ static int f2fs_ioc_abort_volatile_write(struct file *filp) drop_inmem_pages(inode); if (f2fs_is_volatile_file(inode)) { clear_inode_flag(inode, FI_VOLATILE_FILE); + stat_dec_volatile_write(inode); ret = f2fs_do_sync_file(filp, 0, LLONG_MAX, 0, true); } @@ -1712,7 +1796,7 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) f2fs_stop_checkpoint(sbi, false); break; case F2FS_GOING_DOWN_METAFLUSH: - sync_meta_pages(sbi, META, LONG_MAX); + sync_meta_pages(sbi, META, LONG_MAX, FS_META_IO); f2fs_stop_checkpoint(sbi, false); break; default: @@ -1773,31 +1857,16 @@ static bool uuid_is_nonzero(__u8 u[16]) static int f2fs_ioc_set_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; struct inode *inode = file_inode(filp); - if (copy_from_user(&policy, (struct fscrypt_policy __user *)arg, - sizeof(policy))) - return -EFAULT; - f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); - return fscrypt_process_policy(filp, &policy); + return fscrypt_ioctl_set_policy(filp, (const void __user *)arg); } static int f2fs_ioc_get_encryption_policy(struct file *filp, unsigned long arg) { - struct fscrypt_policy policy; - struct inode *inode = file_inode(filp); - int err; - - err = fscrypt_get_policy(inode, &policy); - if (err) - return err; - - if (copy_to_user((struct fscrypt_policy __user *)arg, &policy, sizeof(policy))) - return -EFAULT; - return 0; + return fscrypt_ioctl_get_policy(filp, (void __user *)arg); } static int f2fs_ioc_get_encryption_pwsalt(struct file *filp, unsigned long arg) @@ -1863,7 +1932,51 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) mutex_lock(&sbi->gc_mutex); } - ret = f2fs_gc(sbi, sync, true); + ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_gc_range(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct f2fs_gc_range range; + u64 end; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&range, (struct f2fs_gc_range __user *)arg, + sizeof(range))) + return -EFAULT; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + end = range.start + range.len; + if (range.start < MAIN_BLKADDR(sbi) || end >= MAX_BLKADDR(sbi)) + return -EINVAL; +do_more: + if (!range.sync) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + } else { + mutex_lock(&sbi->gc_mutex); + } + + ret = f2fs_gc(sbi, range.sync, true, GET_SEGNO(sbi, range.start)); + range.start += sbi->blocks_per_seg; + if (range.start <= end) + goto do_more; out: mnt_drop_write_file(filp); return ret; @@ -1897,17 +2010,16 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_pgofs = NULL }; - struct extent_info ei; + struct extent_info ei = {0,0,0}; pgoff_t pg_start, pg_end; unsigned int blk_per_seg = sbi->blocks_per_seg; unsigned int total = 0, sec_num; - unsigned int pages_per_sec = sbi->segs_per_sec * blk_per_seg; block_t blk_end = 0; bool fragmented = false; int err; /* if in-place-update policy is enabled, don't waste time here */ - if (need_inplace_update(inode)) + if (need_inplace_update_policy(inode, NULL)) return -EINVAL; pg_start = range->start >> PAGE_SHIFT; @@ -1941,7 +2053,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, */ while (map.m_lblk < pg_end) { map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto out; @@ -1965,7 +2077,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, map.m_lblk = pg_start; map.m_len = pg_end - pg_start; - sec_num = (map.m_len + pages_per_sec - 1) / pages_per_sec; + sec_num = (map.m_len + BLKS_PER_SEC(sbi) - 1) / BLKS_PER_SEC(sbi); /* * make sure there are enough free section for LFS allocation, this can @@ -1983,7 +2095,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, do_map: map.m_len = pg_end - map.m_lblk; - err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_READ); + err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT); if (err) goto clear_out; @@ -2042,42 +2154,40 @@ static int f2fs_ioc_defragment(struct file *filp, unsigned long arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || f2fs_is_atomic_file(inode)) + return -EINVAL; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, + sizeof(range))) + return -EFAULT; + + /* verify alignment of offset & size */ + if (range.start & (F2FS_BLKSIZE - 1) || range.len & (F2FS_BLKSIZE - 1)) + return -EINVAL; + + if (unlikely((range.start + range.len) >> PAGE_SHIFT > + sbi->max_file_blocks)) return -EINVAL; err = mnt_want_write_file(filp); if (err) return err; - if (f2fs_readonly(sbi->sb)) { - err = -EROFS; - goto out; - } - - if (copy_from_user(&range, (struct f2fs_defragment __user *)arg, - sizeof(range))) { - err = -EFAULT; - goto out; - } - - /* verify alignment of offset & size */ - if (range.start & (F2FS_BLKSIZE - 1) || - range.len & (F2FS_BLKSIZE - 1)) { - err = -EINVAL; - goto out; - } - err = f2fs_defragment_range(sbi, filp, &range); + mnt_drop_write_file(filp); + f2fs_update_time(sbi, REQ_TIME); if (err < 0) - goto out; + return err; if (copy_to_user((struct f2fs_defragment __user *)arg, &range, sizeof(range))) - err = -EFAULT; -out: - mnt_drop_write_file(filp); - return err; + return -EFAULT; + + return 0; } static int f2fs_move_file_range(struct file *file_in, loff_t pos_in, @@ -2211,6 +2321,8 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) range.pos_out, range.len); mnt_drop_write_file(filp); + if (err) + goto err_out; if (copy_to_user((struct f2fs_move_range __user *)arg, &range, sizeof(range))) @@ -2220,6 +2332,79 @@ static int f2fs_ioc_move_range(struct file *filp, unsigned long arg) return err; } +static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct sit_info *sm = SIT_I(sbi); + unsigned int start_segno = 0, end_segno = 0; + unsigned int dev_start_segno = 0, dev_end_segno = 0; + struct f2fs_flush_device range; + int ret; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (f2fs_readonly(sbi->sb)) + return -EROFS; + + if (copy_from_user(&range, (struct f2fs_flush_device __user *)arg, + sizeof(range))) + return -EFAULT; + + if (sbi->s_ndevs <= 1 || sbi->s_ndevs - 1 <= range.dev_num || + sbi->segs_per_sec != 1) { + f2fs_msg(sbi->sb, KERN_WARNING, + "Can't flush %u in %d for segs_per_sec %u != 1\n", + range.dev_num, sbi->s_ndevs, + sbi->segs_per_sec); + return -EINVAL; + } + + ret = mnt_want_write_file(filp); + if (ret) + return ret; + + if (range.dev_num != 0) + dev_start_segno = GET_SEGNO(sbi, FDEV(range.dev_num).start_blk); + dev_end_segno = GET_SEGNO(sbi, FDEV(range.dev_num).end_blk); + + start_segno = sm->last_victim[FLUSH_DEVICE]; + if (start_segno < dev_start_segno || start_segno >= dev_end_segno) + start_segno = dev_start_segno; + end_segno = min(start_segno + range.segments, dev_end_segno); + + while (start_segno < end_segno) { + if (!mutex_trylock(&sbi->gc_mutex)) { + ret = -EBUSY; + goto out; + } + sm->last_victim[GC_CB] = end_segno + 1; + sm->last_victim[GC_GREEDY] = end_segno + 1; + sm->last_victim[ALLOC_NEXT] = end_segno + 1; + ret = f2fs_gc(sbi, true, true, start_segno); + if (ret == -EAGAIN) + ret = 0; + else if (ret < 0) + break; + start_segno++; + } +out: + mnt_drop_write_file(filp); + return ret; +} + +static int f2fs_ioc_get_features(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + u32 sb_feature = le32_to_cpu(F2FS_I_SB(inode)->raw_super->feature); + + /* Must validate to set it with SQLite behavior in Android. */ + sb_feature |= F2FS_FEATURE_ATOMIC_WRITE; + + return put_user(sb_feature, (u32 __user *)arg); +} + long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { switch (cmd) { @@ -2251,12 +2436,18 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_get_encryption_pwsalt(filp, arg); case F2FS_IOC_GARBAGE_COLLECT: return f2fs_ioc_gc(filp, arg); + case F2FS_IOC_GARBAGE_COLLECT_RANGE: + return f2fs_ioc_gc_range(filp, arg); case F2FS_IOC_WRITE_CHECKPOINT: return f2fs_ioc_write_checkpoint(filp, arg); case F2FS_IOC_DEFRAGMENT: return f2fs_ioc_defragment(filp, arg); case F2FS_IOC_MOVE_RANGE: return f2fs_ioc_move_range(filp, arg); + case F2FS_IOC_FLUSH_DEVICE: + return f2fs_ioc_flush_device(filp, arg); + case F2FS_IOC_GET_FEATURES: + return f2fs_ioc_get_features(filp, arg); default: return -ENOTTY; } @@ -2269,16 +2460,15 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct blk_plug plug; ssize_t ret; - if (f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode) && - fscrypt_get_encryption_info(inode)) - return -EACCES; - inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) { - int err = f2fs_preallocate_blocks(iocb, from); + int err; + if (iov_iter_fault_in_readable(from, iov_iter_count(from))) + set_inode_flag(inode, FI_NO_PREALLOC); + + err = f2fs_preallocate_blocks(iocb, from); if (err) { inode_unlock(inode); return err; @@ -2286,6 +2476,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); blk_finish_plug(&plug); + clear_inode_flag(inode, FI_NO_PREALLOC); + + if (ret > 0) + f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); } inode_unlock(inode); @@ -2322,10 +2516,12 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_GET_ENCRYPTION_PWSALT: case F2FS_IOC_GET_ENCRYPTION_POLICY: case F2FS_IOC_GARBAGE_COLLECT: + case F2FS_IOC_GARBAGE_COLLECT_RANGE: case F2FS_IOC_WRITE_CHECKPOINT: case F2FS_IOC_DEFRAGMENT: - break; case F2FS_IOC_MOVE_RANGE: + case F2FS_IOC_FLUSH_DEVICE: + case F2FS_IOC_GET_FEATURES: break; default: return -ENOIOCTLCMD; @@ -2341,6 +2537,7 @@ const struct file_operations f2fs_file_operations = { .open = f2fs_file_open, .release = f2fs_release_file, .mmap = f2fs_file_mmap, + .flush = f2fs_file_flush, .fsync = f2fs_sync_file, .fallocate = f2fs_fallocate, .unlocked_ioctl = f2fs_ioctl, diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 7f0c3e02408c..bd16e6631cf3 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -28,17 +28,23 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; - long wait_ms; + unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; + set_freezable(); do { + wait_event_interruptible_timeout(*wq, + kthread_should_stop() || freezing(current) || + gc_th->gc_wake, + msecs_to_jiffies(wait_ms)); + + /* give it a try one time */ + if (gc_th->gc_wake) + gc_th->gc_wake = 0; + if (try_to_freeze()) continue; - else - wait_event_interruptible_timeout(*wq, - kthread_should_stop(), - msecs_to_jiffies(wait_ms)); if (kthread_should_stop()) break; @@ -48,10 +54,15 @@ static int gc_thread_func(void *data) } #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif + if (!sb_start_write_trylock(sbi->sb)) + continue; + /* * [GC triggering condition] * 0. GC is not conducted currently. @@ -66,23 +77,28 @@ static int gc_thread_func(void *data) * So, I'd like to wait some time to collect dirty segments. */ if (!mutex_trylock(&sbi->gc_mutex)) - continue; + goto next; + + if (gc_th->gc_urgent) { + wait_ms = gc_th->urgent_sleep_time; + goto do_gc; + } if (!is_idle(sbi)) { increase_sleep_time(gc_th, &wait_ms); mutex_unlock(&sbi->gc_mutex); - continue; + goto next; } if (has_enough_invalid_blocks(sbi)) decrease_sleep_time(gc_th, &wait_ms); else increase_sleep_time(gc_th, &wait_ms); - +do_gc: stat_inc_bggc_count(sbi); /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true)) + if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; trace_f2fs_background_gc(sbi->sb, wait_ms, @@ -90,6 +106,8 @@ static int gc_thread_func(void *data) /* balancing f2fs's metadata periodically */ f2fs_balance_fs_bg(sbi); +next: + sb_end_write(sbi->sb); } while (!kthread_should_stop()); return 0; @@ -107,11 +125,14 @@ int start_gc_thread(struct f2fs_sb_info *sbi) goto out; } + gc_th->urgent_sleep_time = DEF_GC_THREAD_URGENT_SLEEP_TIME; gc_th->min_sleep_time = DEF_GC_THREAD_MIN_SLEEP_TIME; gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; gc_th->gc_idle = 0; + gc_th->gc_urgent = 0; + gc_th->gc_wake= 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); @@ -166,10 +187,15 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type, p->ofs_unit = sbi->segs_per_sec; } - if (p->max_search > sbi->max_victim_search) + /* we need to check every dirty segments in the FG_GC case */ + if (gc_type != FG_GC && p->max_search > sbi->max_victim_search) p->max_search = sbi->max_victim_search; - p->offset = sbi->last_victim[p->gc_mode]; + /* let's select beginning hot/small space first */ + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + p->offset = 0; + else + p->offset = SIT_I(sbi)->last_victim[p->gc_mode]; } static unsigned int get_max_cost(struct f2fs_sb_info *sbi, @@ -179,7 +205,7 @@ static unsigned int get_max_cost(struct f2fs_sb_info *sbi, if (p->alloc_mode == SSR) return sbi->blocks_per_seg; if (p->gc_mode == GC_GREEDY) - return sbi->blocks_per_seg * p->ofs_unit; + return 2 * sbi->blocks_per_seg * p->ofs_unit; else if (p->gc_mode == GC_CB) return UINT_MAX; else /* No other gc_mode */ @@ -199,8 +225,12 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) for_each_set_bit(secno, dirty_i->victim_secmap, MAIN_SECS(sbi)) { if (sec_usage_check(sbi, secno)) continue; + + if (no_fggc_candidate(sbi, secno)) + continue; + clear_bit(secno, dirty_i->victim_secmap); - return secno * sbi->segs_per_sec; + return GET_SEG_FROM_SEC(sbi, secno); } return NULL_SEGNO; } @@ -208,8 +238,8 @@ static unsigned int check_bg_victims(struct f2fs_sb_info *sbi) static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - unsigned int secno = GET_SECNO(sbi, segno); - unsigned int start = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start = GET_SEG_FROM_SEC(sbi, secno); unsigned long long mtime = 0; unsigned int vblocks; unsigned char age = 0; @@ -218,7 +248,7 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) for (i = 0; i < sbi->segs_per_sec; i++) mtime += get_seg_entry(sbi, start + i)->mtime; - vblocks = get_valid_blocks(sbi, segno, sbi->segs_per_sec); + vblocks = get_valid_blocks(sbi, segno, true); mtime = div_u64(mtime, sbi->segs_per_sec); vblocks = div_u64(vblocks, sbi->segs_per_sec); @@ -237,6 +267,16 @@ static unsigned int get_cb_cost(struct f2fs_sb_info *sbi, unsigned int segno) return UINT_MAX - ((100 * (100 - u) * age) / (100 + u)); } +static unsigned int get_greedy_cost(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int valid_blocks = + get_valid_blocks(sbi, segno, true); + + return IS_DATASEG(get_seg_entry(sbi, segno)->type) ? + valid_blocks * 2 : valid_blocks; +} + static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, unsigned int segno, struct victim_sel_policy *p) { @@ -245,7 +285,7 @@ static inline unsigned int get_gc_cost(struct f2fs_sb_info *sbi, /* alloc_mode == LFS */ if (p->gc_mode == GC_GREEDY) - return get_valid_blocks(sbi, segno, sbi->segs_per_sec); + return get_greedy_cost(sbi, segno); else return get_cb_cost(sbi, segno); } @@ -274,6 +314,7 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, unsigned int *result, int gc_type, int type, char alloc_mode) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + struct sit_info *sm = SIT_I(sbi); struct victim_sel_policy p; unsigned int secno, last_victim; unsigned int last_segment = MAIN_SEGS(sbi); @@ -287,10 +328,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, p.min_segno = NULL_SEGNO; p.min_cost = get_max_cost(sbi, &p); + if (*result != NULL_SEGNO) { + if (IS_DATASEG(get_seg_entry(sbi, *result)->type) && + get_valid_blocks(sbi, *result, false) && + !sec_usage_check(sbi, GET_SEC_FROM_SEG(sbi, *result))) + p.min_segno = *result; + goto out; + } + if (p.max_search == 0) goto out; - last_victim = sbi->last_victim[p.gc_mode]; + last_victim = sm->last_victim[p.gc_mode]; if (p.alloc_mode == LFS && gc_type == FG_GC) { p.min_segno = check_bg_victims(sbi); if (p.min_segno != NULL_SEGNO) @@ -303,9 +352,10 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, segno = find_next_bit(p.dirty_segmap, last_segment, p.offset); if (segno >= last_segment) { - if (sbi->last_victim[p.gc_mode]) { - last_segment = sbi->last_victim[p.gc_mode]; - sbi->last_victim[p.gc_mode] = 0; + if (sm->last_victim[p.gc_mode]) { + last_segment = + sm->last_victim[p.gc_mode]; + sm->last_victim[p.gc_mode] = 0; p.offset = 0; continue; } @@ -322,13 +372,15 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, nsearched++; } - - secno = GET_SECNO(sbi, segno); + secno = GET_SEC_FROM_SEG(sbi, segno); if (sec_usage_check(sbi, secno)) goto next; if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; + if (gc_type == FG_GC && p.alloc_mode == LFS && + no_fggc_candidate(sbi, secno)) + goto next; cost = get_gc_cost(sbi, segno, &p); @@ -338,17 +390,18 @@ static int get_victim_by_default(struct f2fs_sb_info *sbi, } next: if (nsearched >= p.max_search) { - if (!sbi->last_victim[p.gc_mode] && segno <= last_victim) - sbi->last_victim[p.gc_mode] = last_victim + 1; + if (!sm->last_victim[p.gc_mode] && segno <= last_victim) + sm->last_victim[p.gc_mode] = last_victim + 1; else - sbi->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] = segno + 1; + sm->last_victim[p.gc_mode] %= MAIN_SEGS(sbi); break; } } if (p.min_segno != NULL_SEGNO) { got_it: if (p.alloc_mode == LFS) { - secno = GET_SECNO(sbi, p.min_segno); + secno = GET_SEC_FROM_SEG(sbi, p.min_segno); if (gc_type == FG_GC) sbi->cur_victim_sec = secno; else @@ -531,12 +584,14 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, get_node_info(sbi, nid, dni); if (sum->version != dni->version) { - f2fs_put_page(node_page, 1); - return false; + f2fs_msg(sbi->sb, KERN_WARNING, + "%s: valid data with mismatched node version.", + __func__); + set_sbi_flag(sbi, SBI_NEED_FSCK); } *nofs = ofs_of_node(node_page); - source_blkaddr = datablock_addr(node_page, ofs_in_node); + source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node); f2fs_put_page(node_page, 1); if (source_blkaddr != blkaddr) @@ -544,15 +599,21 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, return true; } -static void move_encrypted_block(struct inode *inode, block_t bidx, - unsigned int segno, int off) +/* + * Move data block via META_MAPPING while keeping locked data page. + * This can be used to move blocks, aka LBAs, directly on disk. + */ +static void move_data_block(struct inode *inode, block_t bidx, + unsigned int segno, int off) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_READ, .op_flags = REQ_SYNC, .encrypted_page = NULL, + .in_list = false, }; struct dnode_of_data dn; struct f2fs_summary sum; @@ -596,7 +657,7 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, - &sum, CURSEG_COLD_DATA); + &sum, CURSEG_COLD_DATA, NULL, false); fio.encrypted_page = pagecache_get_page(META_MAPPING(fio.sbi), newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); @@ -634,7 +695,9 @@ static void move_encrypted_block(struct inode *inode, block_t bidx, fio.op = REQ_OP_WRITE; fio.op_flags = REQ_SYNC | REQ_NOIDLE; fio.new_blkaddr = newaddr; - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(fio.sbi, FS_GC_DATA_IO, F2FS_BLKSIZE); f2fs_update_data_blkaddr(&dn, newaddr); set_inode_flag(inode, FI_APPEND_WRITE); @@ -676,10 +739,14 @@ static void move_data_page(struct inode *inode, block_t bidx, int gc_type, struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), .type = DATA, + .temp = COLD, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE, + .op_flags = REQ_SYNC, + .old_blkaddr = NULL_ADDR, .page = page, .encrypted_page = NULL, + .need_lock = LOCK_REQ, + .io_type = FS_GC_DATA_IO, }; bool is_dirty = PageDirty(page); int err; @@ -768,8 +835,7 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; /* if encrypted inode, let's go phase 3 */ - if (f2fs_encrypted_inode(inode) && - S_ISREG(inode->i_mode)) { + if (f2fs_encrypted_file(inode)) { add_gc_inode(gc_list, inode); continue; } @@ -803,14 +869,18 @@ static void gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, continue; } locked = true; + + /* wait for all inflight aio data */ + inode_dio_wait(inode); } start_bidx = start_bidx_of_node(nofs, inode) + ofs_in_node; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) - move_encrypted_block(inode, start_bidx, segno, off); + if (f2fs_encrypted_file(inode)) + move_data_block(inode, start_bidx, segno, off); else - move_data_page(inode, start_bidx, gc_type, segno, off); + move_data_page(inode, start_bidx, gc_type, + segno, off); if (locked) { up_write(&fi->dio_rwsem[WRITE]); @@ -847,7 +917,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + sbi->segs_per_sec; - int sec_freed = 0; + int seg_freed = 0; unsigned char type = IS_DATASEG(get_seg_entry(sbi, segno)->type) ? SUM_TYPE_DATA : SUM_TYPE_NODE; @@ -871,7 +941,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, GET_SUM_BLOCK(sbi, segno)); f2fs_put_page(sum_page, 0); - if (get_valid_blocks(sbi, segno, 1) == 0 || + if (get_valid_blocks(sbi, segno, false) == 0 || !PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) goto next; @@ -886,7 +956,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, * - mutex_lock(sentry_lock) - change_curseg() * - lock_page(sum_page) */ - if (type == SUM_TYPE_NODE) gc_node_segment(sbi, sum->entries, segno, gc_type); else @@ -894,90 +963,113 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); stat_inc_seg_count(sbi, type, gc_type); + + if (gc_type == FG_GC && + get_valid_blocks(sbi, segno, false) == 0) + seg_freed++; next: f2fs_put_page(sum_page, 0); } if (gc_type == FG_GC) - f2fs_submit_merged_bio(sbi, - (type == SUM_TYPE_NODE) ? NODE : DATA, WRITE); + f2fs_submit_merged_write(sbi, + (type == SUM_TYPE_NODE) ? NODE : DATA); blk_finish_plug(&plug); - if (gc_type == FG_GC && - get_valid_blocks(sbi, start_segno, sbi->segs_per_sec) == 0) - sec_freed = 1; - stat_inc_call_count(sbi->stat_info); - return sec_freed; + return seg_freed; } -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, + bool background, unsigned int segno) { - unsigned int segno; int gc_type = sync ? FG_GC : BG_GC; - int sec_freed = 0; - int ret = -EINVAL; + int sec_freed = 0, seg_freed = 0, total_freed = 0; + int ret = 0; struct cp_control cpc; + unsigned int init_segno = segno; struct gc_inode_list gc_list = { .ilist = LIST_HEAD_INIT(gc_list.ilist), .iroot = RADIX_TREE_INIT(GFP_NOFS), }; + trace_f2fs_gc_begin(sbi->sb, sync, background, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + cpc.reason = __get_cp_reason(sbi); gc_more: - segno = NULL_SEGNO; - - if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) + if (unlikely(!(sbi->sb->s_flags & MS_ACTIVE))) { + ret = -EINVAL; goto stop; + } if (unlikely(f2fs_cp_error(sbi))) { ret = -EIO; goto stop; } - if (gc_type == BG_GC && has_not_enough_free_secs(sbi, sec_freed, 0)) { - gc_type = FG_GC; + if (gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) { /* - * If there is no victim and no prefree segment but still not - * enough free sections, we should flush dent/node blocks and do - * garbage collections. + * For example, if there are many prefree_segments below given + * threshold, we can make them free by checkpoint. Then, we + * secure free segments which doesn't need fggc any more. */ - if (__get_victim(sbi, &segno, gc_type) || - prefree_segments(sbi)) { - ret = write_checkpoint(sbi, &cpc); - if (ret) - goto stop; - segno = NULL_SEGNO; - } else if (has_not_enough_free_secs(sbi, 0, 0)) { + if (prefree_segments(sbi)) { ret = write_checkpoint(sbi, &cpc); if (ret) goto stop; } - } else if (gc_type == BG_GC && !background) { - /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (has_not_enough_free_secs(sbi, 0, 0)) + gc_type = FG_GC; + } + + /* f2fs_balance_fs doesn't need to do BG_GC in critical path. */ + if (gc_type == BG_GC && !background) { + ret = -EINVAL; + goto stop; + } + if (!__get_victim(sbi, &segno, gc_type)) { + ret = -ENODATA; goto stop; } - if (segno == NULL_SEGNO && !__get_victim(sbi, &segno, gc_type)) - goto stop; - ret = 0; - - if (do_garbage_collect(sbi, segno, &gc_list, gc_type) && - gc_type == FG_GC) + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + if (gc_type == FG_GC && seg_freed == sbi->segs_per_sec) sec_freed++; + total_freed += seg_freed; if (gc_type == FG_GC) sbi->cur_victim_sec = NULL_SEGNO; if (!sync) { - if (has_not_enough_free_secs(sbi, sec_freed, 0)) + if (has_not_enough_free_secs(sbi, sec_freed, 0)) { + segno = NULL_SEGNO; goto gc_more; + } if (gc_type == FG_GC) ret = write_checkpoint(sbi, &cpc); } stop: + SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0; + SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno; + + trace_f2fs_gc_end(sbi->sb, ret, total_freed, sec_freed, + get_pages(sbi, F2FS_DIRTY_NODES), + get_pages(sbi, F2FS_DIRTY_DENTS), + get_pages(sbi, F2FS_DIRTY_IMETA), + free_sections(sbi), + free_segments(sbi), + reserved_segments(sbi), + prefree_segments(sbi)); + mutex_unlock(&sbi->gc_mutex); put_gc_inode(&gc_list); @@ -989,5 +1081,20 @@ int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background) void build_gc_manager(struct f2fs_sb_info *sbi) { + u64 main_count, resv_count, ovp_count; + DIRTY_I(sbi)->v_ops = &default_v_ops; + + /* threshold of # of valid blocks in a section for victims of FG_GC */ + main_count = SM_I(sbi)->main_segments << sbi->log_blocks_per_seg; + resv_count = SM_I(sbi)->reserved_segments << sbi->log_blocks_per_seg; + ovp_count = SM_I(sbi)->ovp_segments << sbi->log_blocks_per_seg; + + sbi->fggc_threshold = div64_u64((main_count - ovp_count) * + BLKS_PER_SEC(sbi), (main_count - resv_count)); + + /* give warm/cold data area from slower device */ + if (sbi->s_ndevs && sbi->segs_per_sec == 1) + SIT_I(sbi)->last_victim[ALLOC_NEXT] = + GET_SEGNO(sbi, FDEV(0).end_blk) + 1; } diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index a993967dcdb9..9325191fab2d 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -13,6 +13,7 @@ * whether IO subsystem is idle * or not */ +#define DEF_GC_THREAD_URGENT_SLEEP_TIME 500 /* 500 ms */ #define DEF_GC_THREAD_MIN_SLEEP_TIME 30000 /* milliseconds */ #define DEF_GC_THREAD_MAX_SLEEP_TIME 60000 #define DEF_GC_THREAD_NOGC_SLEEP_TIME 300000 /* wait 5 min */ @@ -27,12 +28,15 @@ struct f2fs_gc_kthread { wait_queue_head_t gc_wait_queue_head; /* for gc sleep time */ + unsigned int urgent_sleep_time; unsigned int min_sleep_time; unsigned int max_sleep_time; unsigned int no_gc_sleep_time; /* for changing gc mode */ unsigned int gc_idle; + unsigned int gc_urgent; + unsigned int gc_wake; }; struct gc_inode_list { @@ -65,25 +69,32 @@ static inline block_t limit_free_user_blocks(struct f2fs_sb_info *sbi) } static inline void increase_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + unsigned int max_time = gc_th->max_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) return; - *wait += gc_th->min_sleep_time; - if (*wait > gc_th->max_sleep_time) - *wait = gc_th->max_sleep_time; + if ((long long)*wait + (long long)min_time > (long long)max_time) + *wait = max_time; + else + *wait += min_time; } static inline void decrease_sleep_time(struct f2fs_gc_kthread *gc_th, - long *wait) + unsigned int *wait) { + unsigned int min_time = gc_th->min_sleep_time; + if (*wait == gc_th->no_gc_sleep_time) *wait = gc_th->max_sleep_time; - *wait -= gc_th->min_sleep_time; - if (*wait <= gc_th->min_sleep_time) - *wait = gc_th->min_sleep_time; + if ((long long)*wait - (long long)min_time < (long long)min_time) + *wait = min_time; + else + *wait -= min_time; } static inline bool has_enough_invalid_blocks(struct f2fs_sb_info *sbi) diff --git a/fs/f2fs/hash.c b/fs/f2fs/hash.c index 71b7206c431e..eb2e031ea887 100644 --- a/fs/f2fs/hash.c +++ b/fs/f2fs/hash.c @@ -70,7 +70,8 @@ static void str2hashbuf(const unsigned char *msg, size_t len, *buf++ = pad; } -f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) +f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info, + struct fscrypt_name *fname) { __u32 hash; f2fs_hash_t f2fs_hash; @@ -79,6 +80,10 @@ f2fs_hash_t f2fs_dentry_hash(const struct qstr *name_info) const unsigned char *name = name_info->name; size_t len = name_info->len; + /* encrypted bigname case */ + if (fname && !fname->disk_name.name) + return cpu_to_le32(fname->hash); + if (is_dot_dotdot(name_info)) return 0; diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index fb5d7d1f34aa..fbf22b0f667f 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -23,10 +23,10 @@ bool f2fs_may_inline_data(struct inode *inode) if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode)) return false; - if (i_size_read(inode) > MAX_INLINE_DATA) + if (i_size_read(inode) > MAX_INLINE_DATA(inode)) return false; - if (f2fs_encrypted_inode(inode) && S_ISREG(inode->i_mode)) + if (f2fs_encrypted_file(inode)) return false; return true; @@ -45,6 +45,7 @@ bool f2fs_may_inline_dentry(struct inode *inode) void read_inline_data(struct page *page, struct page *ipage) { + struct inode *inode = page->mapping->host; void *src_addr, *dst_addr; if (PageUptodate(page)) @@ -52,31 +53,33 @@ void read_inline_data(struct page *page, struct page *ipage) f2fs_bug_on(F2FS_P_SB(page), page->index); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(inode), PAGE_SIZE); /* Copy the whole inline data block */ - src_addr = inline_data_addr(ipage); + src_addr = inline_data_addr(inode, ipage); dst_addr = kmap_atomic(page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); flush_dcache_page(page); kunmap_atomic(dst_addr); if (!PageUptodate(page)) SetPageUptodate(page); } -bool truncate_inline_inode(struct page *ipage, u64 from) +void truncate_inline_inode(struct inode *inode, struct page *ipage, u64 from) { void *addr; - if (from >= MAX_INLINE_DATA) - return false; + if (from >= MAX_INLINE_DATA(inode)) + return; - addr = inline_data_addr(ipage); + addr = inline_data_addr(inode, ipage); f2fs_wait_on_page_writeback(ipage, NODE, true); - memset(addr + from, 0, MAX_INLINE_DATA - from); + memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); set_page_dirty(ipage); - return true; + + if (from == 0) + clear_inode_flag(inode, FI_DATA_EXIST); } int f2fs_read_inline_data(struct inode *inode, struct page *page) @@ -132,6 +135,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, .page = page, .encrypted_page = NULL, + .io_type = FS_DATA_IO, }; int dirty, err; @@ -153,6 +157,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) /* write data page to try to make data consistent */ set_page_writeback(page); fio.old_blkaddr = dn->data_blkaddr; + set_inode_flag(dn->inode, FI_HOT_DATA); write_data_page(dn, &fio); f2fs_wait_on_page_writeback(page, DATA, true); if (dirty) { @@ -164,11 +169,11 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - truncate_inline_inode(dn->inode_page, 0); + truncate_inline_inode(dn->inode, dn->inode_page, 0); clear_inline_node(dn->inode_page); clear_out: stat_dec_inline_inode(dn->inode); - f2fs_clear_inline_inode(dn->inode); + clear_inode_flag(dn->inode, FI_INLINE_DATA); f2fs_put_dnode(dn); return 0; } @@ -215,6 +220,8 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) { void *src_addr, *dst_addr; struct dnode_of_data dn; + struct address_space *mapping = page_mapping(page); + unsigned long flags; int err; set_new_dnode(&dn, inode, NULL, NULL, 0); @@ -231,11 +238,16 @@ int f2fs_write_inline_data(struct inode *inode, struct page *page) f2fs_wait_on_page_writeback(dn.inode_page, NODE, true); src_addr = kmap_atomic(page); - dst_addr = inline_data_addr(dn.inode_page); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + dst_addr = inline_data_addr(inode, dn.inode_page); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); kunmap_atomic(src_addr); set_page_dirty(dn.inode_page); + spin_lock_irqsave(&mapping->tree_lock, flags); + radix_tree_tag_clear(&mapping->page_tree, page_index(page), + PAGECACHE_TAG_DIRTY); + spin_unlock_irqrestore(&mapping->tree_lock, flags); + set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); @@ -270,9 +282,9 @@ bool recover_inline_data(struct inode *inode, struct page *npage) f2fs_wait_on_page_writeback(ipage, NODE, true); - src_addr = inline_data_addr(npage); - dst_addr = inline_data_addr(ipage); - memcpy(dst_addr, src_addr, MAX_INLINE_DATA); + src_addr = inline_data_addr(inode, npage); + dst_addr = inline_data_addr(inode, ipage); + memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); @@ -285,9 +297,8 @@ bool recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode)) { ipage = get_node_page(sbi, inode->i_ino); f2fs_bug_on(sbi, IS_ERR(ipage)); - if (!truncate_inline_inode(ipage, 0)) - return false; - f2fs_clear_inline_inode(inode); + truncate_inline_inode(inode, ipage, 0); + clear_inode_flag(inode, FI_INLINE_DATA); f2fs_put_page(ipage, 1); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { if (truncate_blocks(inode, 0, false)) @@ -301,11 +312,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, struct fscrypt_name *fname, struct page **res_page) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); - struct f2fs_inline_dentry *inline_dentry; struct qstr name = FSTR_TO_QSTR(&fname->disk_name); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; struct page *ipage; + void *inline_dentry; f2fs_hash_t namehash; ipage = get_node_page(sbi, dir->i_ino); @@ -314,11 +325,11 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, return NULL; } - namehash = f2fs_dentry_hash(&name); + namehash = f2fs_dentry_hash(&name, fname); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(dir, ipage); - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); de = find_target_dentry(fname, namehash, NULL, &d); unlock_page(ipage); if (de) @@ -332,19 +343,19 @@ struct f2fs_dir_entry *find_in_inline_dir(struct inode *dir, int make_empty_inline_dir(struct inode *inode, struct inode *parent, struct page *ipage) { - struct f2fs_inline_dentry *dentry_blk; struct f2fs_dentry_ptr d; + void *inline_dentry; - dentry_blk = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); do_make_empty_dir(inode, parent, &d); set_page_dirty(ipage); /* update i_size to MAX_INLINE_DATA */ - if (i_size_read(inode) < MAX_INLINE_DATA) - f2fs_i_size_write(inode, MAX_INLINE_DATA); + if (i_size_read(inode) < MAX_INLINE_DATA(inode)) + f2fs_i_size_write(inode, MAX_INLINE_DATA(inode)); return 0; } @@ -353,11 +364,12 @@ int make_empty_inline_dir(struct inode *inode, struct inode *parent, * release ipage in this function. */ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { struct page *page; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_ptr src, dst; int err; page = f2fs_grab_cache_page(dir->i_mapping, 0, false); @@ -372,25 +384,24 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, goto out; f2fs_wait_on_page_writeback(page, DATA, true); - zero_user_segment(page, MAX_INLINE_DATA, PAGE_SIZE); + zero_user_segment(page, MAX_INLINE_DATA(dir), PAGE_SIZE); dentry_blk = kmap_atomic(page); + make_dentry_ptr_inline(dir, &src, inline_dentry); + make_dentry_ptr_block(dir, &dst, dentry_blk); + /* copy data from inline dentry block to new dentry block */ - memcpy(dentry_blk->dentry_bitmap, inline_dentry->dentry_bitmap, - INLINE_DENTRY_BITMAP_SIZE); - memset(dentry_blk->dentry_bitmap + INLINE_DENTRY_BITMAP_SIZE, 0, - SIZE_OF_DENTRY_BITMAP - INLINE_DENTRY_BITMAP_SIZE); + memcpy(dst.bitmap, src.bitmap, src.nr_bitmap); + memset(dst.bitmap + src.nr_bitmap, 0, dst.nr_bitmap - src.nr_bitmap); /* * we do not need to zero out remainder part of dentry and filename * field, since we have used bitmap for marking the usage status of * them, besides, we can also ignore copying/zeroing reserved space * of dentry block, because them haven't been used so far. */ - memcpy(dentry_blk->dentry, inline_dentry->dentry, - sizeof(struct f2fs_dir_entry) * NR_INLINE_DENTRY); - memcpy(dentry_blk->filename, inline_dentry->filename, - NR_INLINE_DENTRY * F2FS_SLOT_LEN); + memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); + memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); kunmap_atomic(dentry_blk); if (!PageUptodate(page)) @@ -398,7 +409,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, set_page_dirty(page); /* clear inline dir and flag after data writeback */ - truncate_inline_inode(ipage, 0); + truncate_inline_inode(dir, ipage, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -411,14 +422,13 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, return err; } -static int f2fs_add_inline_entries(struct inode *dir, - struct f2fs_inline_dentry *inline_dentry) +static int f2fs_add_inline_entries(struct inode *dir, void *inline_dentry) { struct f2fs_dentry_ptr d; unsigned long bit_pos = 0; int err = 0; - make_dentry_ptr(NULL, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(dir, &d, inline_dentry); while (bit_pos < d.max) { struct f2fs_dir_entry *de; @@ -460,20 +470,20 @@ static int f2fs_add_inline_entries(struct inode *dir, } static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { - struct f2fs_inline_dentry *backup_dentry; + void *backup_dentry; int err; backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), - sizeof(struct f2fs_inline_dentry), GFP_F2FS_ZERO); + MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { f2fs_put_page(ipage, 1); return -ENOMEM; } - memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA); - truncate_inline_inode(ipage, 0); + memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); + truncate_inline_inode(dir, ipage, 0); unlock_page(ipage); @@ -489,9 +499,9 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, return 0; recover: lock_page(ipage); - memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA); + memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); - f2fs_i_size_write(dir, MAX_INLINE_DATA); + f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); set_page_dirty(ipage); f2fs_put_page(ipage, 1); @@ -500,7 +510,7 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, } static int f2fs_convert_inline_dir(struct inode *dir, struct page *ipage, - struct f2fs_inline_dentry *inline_dentry) + void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) return f2fs_move_inline_dirents(dir, ipage, inline_dentry); @@ -516,7 +526,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, struct page *ipage; unsigned int bit_pos; f2fs_hash_t name_hash; - struct f2fs_inline_dentry *dentry_blk = NULL; + void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(new_name->len); struct page *page = NULL; @@ -526,11 +536,12 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, if (IS_ERR(ipage)) return PTR_ERR(ipage); - dentry_blk = inline_data_addr(ipage); - bit_pos = room_for_filename(&dentry_blk->dentry_bitmap, - slots, NR_INLINE_DENTRY); - if (bit_pos >= NR_INLINE_DENTRY) { - err = f2fs_convert_inline_dir(dir, ipage, dentry_blk); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = room_for_filename(d.bitmap, slots, d.max); + if (bit_pos >= d.max) { + err = f2fs_convert_inline_dir(dir, ipage, inline_dentry); if (err) return err; err = -EAGAIN; @@ -545,14 +556,11 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, err = PTR_ERR(page); goto fail; } - if (f2fs_encrypted_inode(dir)) - file_set_enc_name(inode); } f2fs_wait_on_page_writeback(ipage, NODE, true); - name_hash = f2fs_dentry_hash(new_name); - make_dentry_ptr(NULL, &d, (void *)dentry_blk, 2); + name_hash = f2fs_dentry_hash(new_name, NULL); f2fs_update_dentry(ino, mode, &d, new_name, name_hash, bit_pos); set_page_dirty(ipage); @@ -575,7 +583,8 @@ int f2fs_add_inline_entry(struct inode *dir, const struct qstr *new_name, void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, struct inode *dir, struct inode *inode) { - struct f2fs_inline_dentry *inline_dentry; + struct f2fs_dentry_ptr d; + void *inline_dentry; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); unsigned int bit_pos; int i; @@ -583,11 +592,12 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, lock_page(page); f2fs_wait_on_page_writeback(page, NODE, true); - inline_dentry = inline_data_addr(page); - bit_pos = dentry - inline_dentry->dentry; + inline_dentry = inline_data_addr(dir, page); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) - __clear_bit_le(bit_pos + i, - &inline_dentry->dentry_bitmap); + __clear_bit_le(bit_pos + i, d.bitmap); set_page_dirty(page); f2fs_put_page(page, 1); @@ -604,20 +614,21 @@ bool f2fs_empty_inline_dir(struct inode *dir) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct page *ipage; unsigned int bit_pos = 2; - struct f2fs_inline_dentry *dentry_blk; + void *inline_dentry; + struct f2fs_dentry_ptr d; ipage = get_node_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; - dentry_blk = inline_data_addr(ipage); - bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, - NR_INLINE_DENTRY, - bit_pos); + inline_dentry = inline_data_addr(dir, ipage); + make_dentry_ptr_inline(dir, &d, inline_dentry); + + bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); f2fs_put_page(ipage, 1); - if (bit_pos < NR_INLINE_DENTRY) + if (bit_pos < d.max) return false; return true; @@ -627,25 +638,27 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct f2fs_inline_dentry *inline_dentry = NULL; struct page *ipage = NULL; struct f2fs_dentry_ptr d; + void *inline_dentry = NULL; int err; - if (ctx->pos == NR_INLINE_DENTRY) + make_dentry_ptr_inline(inode, &d, inline_dentry); + + if (ctx->pos == d.max) return 0; ipage = get_node_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); - inline_dentry = inline_data_addr(ipage); + inline_dentry = inline_data_addr(inode, ipage); - make_dentry_ptr(inode, &d, (void *)inline_dentry, 2); + make_dentry_ptr_inline(inode, &d, inline_dentry); err = f2fs_fill_dentries(ctx, &d, 0, fstr); if (!err) - ctx->pos = NR_INLINE_DENTRY; + ctx->pos = d.max; f2fs_put_page(ipage, 1); return err < 0 ? err : 0; @@ -670,7 +683,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; } - ilen = min_t(size_t, MAX_INLINE_DATA, i_size_read(inode)); + ilen = min_t(size_t, MAX_INLINE_DATA(inode), i_size_read(inode)); if (start >= ilen) goto out; if (start + len < ilen) @@ -679,7 +692,8 @@ int f2fs_inline_data_fiemap(struct inode *inode, get_node_info(F2FS_I_SB(inode), inode->i_ino, &ni); byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(ipage) - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ipage) - + (char *)F2FS_INODE(ipage); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); out: f2fs_put_page(ipage, 1); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index af06bda51a54..50c88e37ed66 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -16,6 +16,7 @@ #include "f2fs.h" #include "node.h" +#include "segment.h" #include @@ -44,25 +45,26 @@ void f2fs_set_inode_flags(struct inode *inode) new_fl |= S_DIRSYNC; inode_set_flags(inode, new_fl, S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC); - f2fs_mark_inode_dirty_sync(inode, false); } static void __get_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { - if (ri->i_addr[0]) - inode->i_rdev = - old_decode_dev(le32_to_cpu(ri->i_addr[0])); + if (ri->i_addr[extra_size]) + inode->i_rdev = old_decode_dev( + le32_to_cpu(ri->i_addr[extra_size])); else - inode->i_rdev = - new_decode_dev(le32_to_cpu(ri->i_addr[1])); + inode->i_rdev = new_decode_dev( + le32_to_cpu(ri->i_addr[extra_size + 1])); } } static bool __written_first_block(struct f2fs_inode *ri) { - block_t addr = le32_to_cpu(ri->i_addr[0]); + block_t addr = le32_to_cpu(ri->i_addr[offset_in_addr(ri)]); if (addr != NEW_ADDR && addr != NULL_ADDR) return true; @@ -71,25 +73,27 @@ static bool __written_first_block(struct f2fs_inode *ri) static void __set_inode_rdev(struct inode *inode, struct f2fs_inode *ri) { + int extra_size = get_extra_isize(inode); + if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { - ri->i_addr[0] = + ri->i_addr[extra_size] = cpu_to_le32(old_encode_dev(inode->i_rdev)); - ri->i_addr[1] = 0; + ri->i_addr[extra_size + 1] = 0; } else { - ri->i_addr[0] = 0; - ri->i_addr[1] = + ri->i_addr[extra_size] = 0; + ri->i_addr[extra_size + 1] = cpu_to_le32(new_encode_dev(inode->i_rdev)); - ri->i_addr[2] = 0; + ri->i_addr[extra_size + 2] = 0; } } } static void __recover_inline_status(struct inode *inode, struct page *ipage) { - void *inline_data = inline_data_addr(ipage); + void *inline_data = inline_data_addr(inode, ipage); __le32 *start = inline_data; - __le32 *end = start + MAX_INLINE_DATA / sizeof(__le32); + __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { @@ -104,12 +108,84 @@ static void __recover_inline_status(struct inode *inode, struct page *ipage) return; } +static bool f2fs_enable_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + int extra_isize = le32_to_cpu(ri->i_extra_isize); + + if (!f2fs_sb_has_inode_chksum(sbi->sb)) + return false; + + if (!RAW_IS_INODE(F2FS_NODE(page)) || !(ri->i_inline & F2FS_EXTRA_ATTR)) + return false; + + if (!F2FS_FITS_IN_INODE(ri, extra_isize, i_inode_checksum)) + return false; + + return true; +} + +static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_node *node = F2FS_NODE(page); + struct f2fs_inode *ri = &node->i; + __le32 ino = node->footer.ino; + __le32 gen = ri->i_generation; + __u32 chksum, chksum_seed; + __u32 dummy_cs = 0; + unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); + unsigned int cs_size = sizeof(dummy_cs); + + chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, + sizeof(ino)); + chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + + chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + offset += cs_size; + chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); + return chksum; +} + +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri; + __u32 provided, calculated; + + if (!f2fs_enable_inode_chksum(sbi, page) || + PageDirty(page) || PageWriteback(page)) + return true; + + ri = &F2FS_NODE(page)->i; + provided = le32_to_cpu(ri->i_inode_checksum); + calculated = f2fs_inode_chksum(sbi, page); + + if (provided != calculated) + f2fs_msg(sbi->sb, KERN_WARNING, + "checksum invalid, ino = %x, %x vs. %x", + ino_of_node(page), provided, calculated); + + return provided == calculated; +} + +void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page) +{ + struct f2fs_inode *ri = &F2FS_NODE(page)->i; + + if (!f2fs_enable_inode_chksum(sbi, page)) + return; + + ri->i_inode_checksum = cpu_to_le32(f2fs_inode_chksum(sbi, page)); +} + static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); struct page *node_page; struct f2fs_inode *ri; + projid_t i_projid; /* Check if ino is within scope */ if (check_nid_range(sbi, inode->i_ino)) { @@ -130,7 +206,7 @@ static int do_read_inode(struct inode *inode) i_gid_write(inode, le32_to_cpu(ri->i_gid)); set_nlink(inode, le32_to_cpu(ri->i_links)); inode->i_size = le64_to_cpu(ri->i_size); - inode->i_blocks = le64_to_cpu(ri->i_blocks); + inode->i_blocks = SECTOR_FROM_BLOCK(le64_to_cpu(ri->i_blocks) - 1); inode->i_atime.tv_sec = le64_to_cpu(ri->i_atime); inode->i_ctime.tv_sec = le64_to_cpu(ri->i_ctime); @@ -153,6 +229,9 @@ static int do_read_inode(struct inode *inode) get_inline_info(inode, ri); + fi->i_extra_isize = f2fs_has_extra_attr(inode) ? + le16_to_cpu(ri->i_extra_isize) : 0; + /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) __recover_inline_status(inode, node_page); @@ -166,6 +245,16 @@ static int do_read_inode(struct inode *inode) if (!need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; + if (fi->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + + if (f2fs_has_extra_attr(inode) && f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(ri, fi->i_extra_isize, i_projid)) + i_projid = (projid_t)le32_to_cpu(ri->i_projid); + else + i_projid = F2FS_DEF_PROJID; + fi->i_projid = make_kprojid(&init_user_ns, i_projid); + f2fs_put_page(node_page, 1); stat_inc_inline_xattr(inode); @@ -226,6 +315,7 @@ struct inode *f2fs_iget(struct super_block *sb, unsigned long ino) ret = -EIO; goto bad_inode; } + f2fs_set_inode_flags(inode); unlock_new_inode(inode); trace_f2fs_iget(inode); return inode; @@ -267,7 +357,7 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_gid = cpu_to_le32(i_gid_read(inode)); ri->i_links = cpu_to_le32(inode->i_nlink); ri->i_size = cpu_to_le64(i_size_read(inode)); - ri->i_blocks = cpu_to_le64(inode->i_blocks); + ri->i_blocks = cpu_to_le64(SECTOR_TO_BLOCK(inode->i_blocks) + 1); if (et) { read_lock(&et->lock); @@ -291,6 +381,20 @@ int update_inode(struct inode *inode, struct page *node_page) ri->i_generation = cpu_to_le32(inode->i_generation); ri->i_dir_level = F2FS_I(inode)->i_dir_level; + if (f2fs_has_extra_attr(inode)) { + ri->i_extra_isize = cpu_to_le16(F2FS_I(inode)->i_extra_isize); + + if (f2fs_sb_has_project_quota(F2FS_I_SB(inode)->sb) && + F2FS_FITS_IN_INODE(ri, F2FS_I(inode)->i_extra_isize, + i_projid)) { + projid_t i_projid; + + i_projid = from_kprojid(&init_user_ns, + F2FS_I(inode)->i_projid); + ri->i_projid = cpu_to_le32(i_projid); + } + } + __set_inode_rdev(inode, ri); set_cold_node(inode, node_page); @@ -316,7 +420,6 @@ int update_inode_page(struct inode *inode) } else if (err != -ENOENT) { f2fs_stop_checkpoint(sbi, false); } - f2fs_inode_synced(inode); return 0; } ret = update_inode(inode, node_page); @@ -339,7 +442,8 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) * We need to balance fs here to prevent from producing dirty node pages * during the urgent cleaning time when runing out of free sections. */ - if (update_inode_page(inode) && wbc && wbc->nr_to_write) + update_inode_page(inode); + if (wbc && wbc->nr_to_write) f2fs_balance_fs(sbi, true); return 0; } @@ -372,10 +476,7 @@ void f2fs_evict_inode(struct inode *inode) if (inode->i_nlink || is_bad_inode(inode)) goto no_delete; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_EVICT_INODE)) - goto no_delete; -#endif + dquot_initialize(inode); remove_ino_entry(sbi, inode->i_ino, APPEND_INO); remove_ino_entry(sbi, inode->i_ino, UPDATE_INO); @@ -387,6 +488,12 @@ void f2fs_evict_inode(struct inode *inode) if (F2FS_HAS_BLOCKS(inode)) err = f2fs_truncate(inode); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (time_to_inject(sbi, FAULT_EVICT_INODE)) { + f2fs_show_injection_info(FAULT_EVICT_INODE); + err = -EIO; + } +#endif if (!err) { f2fs_lock_op(sbi); err = remove_inode_page(inode); @@ -403,13 +510,22 @@ void f2fs_evict_inode(struct inode *inode) if (err) update_inode_page(inode); + dquot_free_inode(inode); sb_end_intwrite(inode->i_sb); no_delete: + dquot_drop(inode); + stat_dec_inline_xattr(inode); stat_dec_inline_dir(inode); stat_dec_inline_inode(inode); - invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, inode->i_ino); + if (!is_set_ckpt_flags(sbi, CP_ERROR_FLAG)) + f2fs_bug_on(sbi, is_inode_flag_set(inode, FI_DIRTY_INODE)); + + /* ino == 0, if f2fs_new_inode() was failed t*/ + if (inode->i_ino) + invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino, + inode->i_ino); if (xnid) invalidate_mapping_pages(NODE_MAPPING(sbi), xnid, xnid); if (inode->i_nlink) { @@ -421,9 +537,10 @@ void f2fs_evict_inode(struct inode *inode) if (is_inode_flag_set(inode, FI_FREE_NID)) { alloc_nid_failed(sbi, inode->i_ino); clear_inode_flag(inode, FI_FREE_NID); + } else { + f2fs_bug_on(sbi, err && + !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); } - f2fs_bug_on(sbi, err && - !exist_written_data(sbi, inode->i_ino, ORPHAN_INO)); out_clear: fscrypt_put_encryption_info(inode, NULL); clear_inode(inode); @@ -446,6 +563,7 @@ void handle_failed_inode(struct inode *inode) * in a panic when flushing dirty inodes in gdirty_list. */ update_inode_page(inode); + f2fs_inode_synced(inode); /* don't make bad inode, since it becomes a regular file. */ unlock_new_inode(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index db3079cd665d..d92b8e9064cb 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -15,6 +15,7 @@ #include #include #include +#include #include "f2fs.h" #include "node.h" @@ -42,6 +43,8 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) } f2fs_unlock_op(sbi); + nid_free = true; + inode_init_owner(inode, dir, mode); inode->i_ino = ino; @@ -52,16 +55,35 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) err = insert_inode_locked(inode); if (err) { err = -EINVAL; - nid_free = true; goto fail; } + if (f2fs_sb_has_project_quota(sbi->sb) && + (F2FS_I(dir)->i_flags & FS_PROJINHERIT_FL)) + F2FS_I(inode)->i_projid = F2FS_I(dir)->i_projid; + else + F2FS_I(inode)->i_projid = make_kprojid(&init_user_ns, + F2FS_DEF_PROJID); + + err = dquot_initialize(inode); + if (err) + goto fail_drop; + + err = dquot_alloc_inode(inode); + if (err) + goto fail_drop; + /* If the directory encrypted, then we should encrypt the inode. */ if (f2fs_encrypted_inode(dir) && f2fs_may_encrypt(inode)) f2fs_set_encrypted_inode(inode); set_inode_flag(inode, FI_NEW_INODE); + if (f2fs_sb_has_extra_attr(sbi->sb)) { + set_inode_flag(inode, FI_EXTRA_ATTR); + F2FS_I(inode)->i_extra_isize = F2FS_TOTAL_EXTRA_ATTR_SIZE; + } + if (test_opt(sbi, INLINE_XATTR)) set_inode_flag(inode, FI_INLINE_XATTR); if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode)) @@ -75,6 +97,15 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) stat_inc_inline_inode(inode); stat_inc_inline_dir(inode); + F2FS_I(inode)->i_flags = + f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED); + + if (S_ISDIR(inode->i_mode)) + F2FS_I(inode)->i_flags |= FS_INDEX_FL; + + if (F2FS_I(inode)->i_flags & FS_PROJINHERIT_FL) + set_inode_flag(inode, FI_PROJ_INHERIT); + trace_f2fs_new_inode(inode, 0); return inode; @@ -85,6 +116,16 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode) set_inode_flag(inode, FI_FREE_NID); iput(inode); return ERR_PTR(err); +fail_drop: + trace_f2fs_new_inode(inode, err); + dquot_drop(inode); + inode->i_flags |= S_NOQUOTA; + if (nid_free) + set_inode_flag(inode, FI_FREE_NID); + clear_nlink(inode); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(err); } static int is_multimedia_file(const unsigned char *s, const char *sub) @@ -136,6 +177,10 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, nid_t ino = 0; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -148,8 +193,6 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, inode->i_mapping->a_ops = &f2fs_dblock_aops; ino = inode->i_ino; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -163,6 +206,8 @@ static int f2fs_create(struct inode *dir, struct dentry *dentry, umode_t mode, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -180,6 +225,15 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, !fscrypt_has_permitted_context(dir, inode)) return -EPERM; + if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); inode->i_ctime = current_time(inode); @@ -233,6 +287,10 @@ static int __recover_dot_dentries(struct inode *dir, nid_t pino) return 0; } + err = dquot_initialize(dir); + if (err) + return err; + f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); @@ -324,9 +382,10 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, if (f2fs_encrypted_inode(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { - bool nokey = f2fs_encrypted_inode(inode) && - !fscrypt_has_encryption_key(inode); - err = nokey ? -ENOKEY : -EPERM; + f2fs_msg(inode->i_sb, KERN_WARNING, + "Inconsistent encryption contexts: %lu/%lu", + dir->i_ino, inode->i_ino); + err = -EPERM; goto err_out; } return d_splice_alias(inode, dentry); @@ -346,6 +405,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) trace_f2fs_unlink_enter(dir, dentry); + err = dquot_initialize(dir); + if (err) + return err; + de = f2fs_find_entry(dir, &dentry->d_name, &page); if (!de) { if (IS_ERR(page)) @@ -400,7 +463,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, return err; if (!fscrypt_has_encryption_key(dir)) - return -EPERM; + return -ENOKEY; disk_link.len = (fscrypt_fname_encrypted_size(dir, len) + sizeof(struct fscrypt_symlink_data)); @@ -409,6 +472,10 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, if (disk_link.len > dir->i_sb->s_blocksize) return -ENAMETOOLONG; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFLNK | S_IRWXUGO); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -420,8 +487,6 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, inode_nohighmem(inode); inode->i_mapping->a_ops = &f2fs_dblock_aops; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -444,7 +509,7 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, goto err_out; if (!fscrypt_has_encryption_key(inode)) { - err = -EPERM; + err = -ENOKEY; goto err_out; } @@ -484,6 +549,8 @@ static int f2fs_symlink(struct inode *dir, struct dentry *dentry, } kfree(sd); + + f2fs_balance_fs(sbi, true); return err; out: handle_failed_inode(inode); @@ -496,6 +563,10 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -505,8 +576,6 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_mapping->a_ops = &f2fs_dblock_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_F2FS_HIGH_ZERO); - f2fs_balance_fs(sbi, true); - set_inode_flag(inode, FI_INC_LINK); f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); @@ -521,6 +590,8 @@ static int f2fs_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out_fail: @@ -544,6 +615,10 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, struct inode *inode; int err = 0; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -551,8 +626,6 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &f2fs_special_inode_operations; - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = f2fs_add_link(dentry, inode); if (err) @@ -566,6 +639,8 @@ static int f2fs_mknod(struct inode *dir, struct dentry *dentry, if (IS_DIRSYNC(dir)) f2fs_sync_fs(sbi->sb, 1); + + f2fs_balance_fs(sbi, true); return 0; out: handle_failed_inode(inode); @@ -579,6 +654,10 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, struct inode *inode; int err; + err = dquot_initialize(dir); + if (err) + return err; + inode = f2fs_new_inode(dir, mode); if (IS_ERR(inode)) return PTR_ERR(inode); @@ -592,8 +671,6 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, inode->i_mapping->a_ops = &f2fs_dblock_aops; } - f2fs_balance_fs(sbi, true); - f2fs_lock_op(sbi); err = acquire_orphan_inode(sbi); if (err) @@ -619,6 +696,8 @@ static int __f2fs_tmpfile(struct inode *dir, struct dentry *dentry, /* link_count was changed by d_tmpfile as well. */ f2fs_unlock_op(sbi); unlock_new_inode(inode); + + f2fs_balance_fs(sbi, true); return 0; release_out: @@ -672,6 +751,19 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; } + if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + (!projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -717,13 +809,6 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, if (err) goto put_out_dir; - err = update_dent_inode(old_inode, new_inode, - &new_dentry->d_name); - if (err) { - release_orphan_inode(sbi); - goto put_out_dir; - } - f2fs_set_link(new_dir, new_entry, new_page, old_inode); new_inode->i_ctime = current_time(new_inode); @@ -775,9 +860,10 @@ static int f2fs_rename(struct inode *old_dir, struct dentry *old_dentry, } down_write(&F2FS_I(old_inode)->i_sem); - file_lost_pino(old_inode); - if (new_inode && file_enc_name(new_inode)) - file_set_enc_name(old_inode); + if (!old_dir_entry || whiteout) + file_lost_pino(old_inode); + else + F2FS_I(old_inode)->i_pino = new_dir->i_ino; up_write(&F2FS_I(old_inode)->i_sem); old_inode->i_ctime = current_time(old_inode); @@ -858,6 +944,22 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, !fscrypt_has_permitted_context(old_dir, new_inode))) return -EPERM; + if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(new_dir)->i_projid, + F2FS_I(old_dentry->d_inode)->i_projid)) || + (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + !projid_eq(F2FS_I(old_dir)->i_projid, + F2FS_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + + err = dquot_initialize(old_dir); + if (err) + goto out; + + err = dquot_initialize(new_dir); + if (err) + goto out; + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); if (!old_entry) { if (IS_ERR(old_page)) @@ -905,8 +1007,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, old_nlink = old_dir_entry ? -1 : 1; new_nlink = -old_nlink; err = -EMLINK; - if ((old_nlink > 0 && old_inode->i_nlink >= F2FS_LINK_MAX) || - (new_nlink > 0 && new_inode->i_nlink >= F2FS_LINK_MAX)) + if ((old_nlink > 0 && old_dir->i_nlink >= F2FS_LINK_MAX) || + (new_nlink > 0 && new_dir->i_nlink >= F2FS_LINK_MAX)) goto out_new_dir; } @@ -914,18 +1016,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_lock_op(sbi); - err = update_dent_inode(old_inode, new_inode, &new_dentry->d_name); - if (err) - goto out_unlock; - if (file_enc_name(new_inode)) - file_set_enc_name(old_inode); - - err = update_dent_inode(new_inode, old_inode, &old_dentry->d_name); - if (err) - goto out_undo; - if (file_enc_name(old_inode)) - file_set_enc_name(new_inode); - /* update ".." directory entry info of old dentry */ if (old_dir_entry) f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); @@ -941,7 +1031,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(old_inode); up_write(&F2FS_I(old_inode)->i_sem); - old_dir->i_ctime = CURRENT_TIME; + old_dir->i_ctime = current_time(old_dir); if (old_nlink) { down_write(&F2FS_I(old_dir)->i_sem); f2fs_i_links_write(old_dir, old_nlink > 0); @@ -956,7 +1046,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, file_lost_pino(new_inode); up_write(&F2FS_I(new_inode)->i_sem); - new_dir->i_ctime = CURRENT_TIME; + new_dir->i_ctime = current_time(new_dir); if (new_nlink) { down_write(&F2FS_I(new_dir)->i_sem); f2fs_i_links_write(new_dir, new_nlink > 0); @@ -969,14 +1059,6 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (IS_DIRSYNC(old_dir) || IS_DIRSYNC(new_dir)) f2fs_sync_fs(sbi->sb, 1); return 0; -out_undo: - /* - * Still we may fail to recover name info of f2fs_inode here - * Drop it, once its name is set as encrypted - */ - update_dent_inode(old_inode, old_inode, &old_dentry->d_name); -out_unlock: - f2fs_unlock_op(sbi); out_new_dir: if (new_dir_entry) { f2fs_dentry_kunmap(new_inode, new_dir_page); diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index bc67dc323f7e..32474db18ad9 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -19,10 +19,11 @@ #include "f2fs.h" #include "node.h" #include "segment.h" +#include "xattr.h" #include "trace.h" #include -#define on_build_free_nids(nmi) mutex_is_locked(&nm_i->build_lock) +#define on_build_free_nids(nmi) mutex_is_locked(&(nm_i)->build_lock) static struct kmem_cache *nat_entry_slab; static struct kmem_cache *free_nid_slab; @@ -63,8 +64,9 @@ bool available_free_memory(struct f2fs_sb_info *sbi, int type) int i; for (i = 0; i <= UPDATE_INO; i++) - mem_size += (sbi->im[i].ino_num * - sizeof(struct ino_entry)) >> PAGE_SHIFT; + mem_size += sbi->im[i].ino_num * + sizeof(struct ino_entry); + mem_size >>= PAGE_SHIFT; res = mem_size < ((avail_ram * nm_i->ram_thresh / 100) >> 1); } else if (type == EXTENT_CACHE) { mem_size = (atomic_read(&sbi->total_ext_tree) * @@ -157,9 +159,6 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); struct nat_entry_set *head; - if (get_nat_flag(ne, IS_DIRTY)) - return; - head = radix_tree_lookup(&nm_i->nat_set_root, set); if (!head) { head = f2fs_kmem_cache_alloc(nat_entry_set_slab, GFP_NOFS); @@ -170,25 +169,27 @@ static void __set_nat_cache_dirty(struct f2fs_nm_info *nm_i, head->entry_cnt = 0; f2fs_radix_tree_insert(&nm_i->nat_set_root, set, head); } - list_move_tail(&ne->list, &head->entry_list); + + if (get_nat_flag(ne, IS_DIRTY)) + goto refresh_list; + nm_i->dirty_nat_cnt++; head->entry_cnt++; set_nat_flag(ne, IS_DIRTY, true); +refresh_list: + if (nat_get_blkaddr(ne) == NEW_ADDR) + list_del_init(&ne->list); + else + list_move_tail(&ne->list, &head->entry_list); } static void __clear_nat_cache_dirty(struct f2fs_nm_info *nm_i, - struct nat_entry *ne) + struct nat_entry_set *set, struct nat_entry *ne) { - nid_t set = NAT_BLOCK_OFFSET(ne->ni.nid); - struct nat_entry_set *head; - - head = radix_tree_lookup(&nm_i->nat_set_root, set); - if (head) { - list_move_tail(&ne->list, &nm_i->nat_entries); - set_nat_flag(ne, IS_DIRTY, false); - head->entry_cnt--; - nm_i->dirty_nat_cnt--; - } + list_move_tail(&ne->list, &nm_i->nat_entries); + set_nat_flag(ne, IS_DIRTY, false); + set->entry_cnt--; + nm_i->dirty_nat_cnt--; } static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, @@ -381,6 +382,7 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) struct page *page = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; + pgoff_t index; int i; ni->nid = nid; @@ -406,17 +408,21 @@ void get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct node_info *ni) node_info_from_raw_nat(ni, &ne); } up_read(&curseg->journal_rwsem); - if (i >= 0) + if (i >= 0) { + up_read(&nm_i->nat_tree_lock); goto cache; + } /* Fill node_info from nat page */ - page = get_current_nat_page(sbi, start_nid); + index = current_nat_addr(sbi, nid); + up_read(&nm_i->nat_tree_lock); + + page = get_meta_page(sbi, index); nat_blk = (struct f2fs_nat_block *)page_address(page); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); f2fs_put_page(page, 1); cache: - up_read(&nm_i->nat_tree_lock); /* cache nat entry */ down_write(&nm_i->nat_tree_lock); cache_nat_entry(sbi, nid, &ne); @@ -549,7 +555,7 @@ static int get_node_path(struct inode *inode, long block, level = 3; goto got; } else { - BUG(); + return -E2BIG; } got: return level; @@ -573,6 +579,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) int err = 0; level = get_node_path(dn->inode, index, offset, noffset); + if (level < 0) + return level; nids[0] = dn->inode->i_ino; npage[0] = dn->inode_page; @@ -608,7 +616,7 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = new_node_page(dn, noffset[i], NULL); + npage[i] = new_node_page(dn, noffset[i]); if (IS_ERR(npage[i])) { alloc_nid_failed(sbi, nids[i]); err = PTR_ERR(npage[i]); @@ -649,7 +657,8 @@ int get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) dn->nid = nids[level]; dn->ofs_in_node = offset[level]; dn->node_page = npage[level]; - dn->data_blkaddr = datablock_addr(dn->node_page, dn->ofs_in_node); + dn->data_blkaddr = datablock_addr(dn->inode, + dn->node_page, dn->ofs_in_node); return 0; release_pages: @@ -673,15 +682,11 @@ static void truncate_node(struct dnode_of_data *dn) struct node_info ni; get_node_info(sbi, dn->nid, &ni); - if (dn->inode->i_blocks == 0) { - f2fs_bug_on(sbi, ni.blk_addr != NULL_ADDR); - goto invalidate; - } f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); /* Deallocate node address */ invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, dn->inode); + dec_valid_node_count(sbi, dn->inode, dn->nid == dn->inode->i_ino); set_node_addr(sbi, &ni, NULL_ADDR, false); if (dn->nid == dn->inode->i_ino) { @@ -689,7 +694,7 @@ static void truncate_node(struct dnode_of_data *dn) dec_valid_inode_count(sbi); f2fs_inode_synced(dn->inode); } -invalidate: + clear_node_page_dirty(dn->node_page); set_sbi_flag(sbi, SBI_IS_DIRTY); @@ -875,6 +880,8 @@ int truncate_inode_blocks(struct inode *inode, pgoff_t from) trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); + if (level < 0) + return level; page = get_node_page(sbi, inode->i_ino); if (IS_ERR(page)) { @@ -971,9 +978,6 @@ int truncate_xattr_node(struct inode *inode, struct page *page) f2fs_i_xnid_write(inode, 0); - /* need to do checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); - set_new_dnode(&dn, inode, page, npage, nid); if (page) @@ -1009,7 +1013,7 @@ int remove_inode_page(struct inode *inode) /* 0 is possible, after f2fs_new_inode() has failed */ f2fs_bug_on(F2FS_I_SB(inode), - inode->i_blocks != 0 && inode->i_blocks != 1); + inode->i_blocks != 0 && inode->i_blocks != 8); /* will put inode & node pages */ truncate_node(&dn); @@ -1024,14 +1028,13 @@ struct page *new_inode_page(struct inode *inode) set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); /* caller should f2fs_put_page(page, 1); */ - return new_node_page(&dn, 0, NULL); + return new_node_page(&dn, 0); } -struct page *new_node_page(struct dnode_of_data *dn, - unsigned int ofs, struct page *ipage) +struct page *new_node_page(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct node_info old_ni, new_ni; + struct node_info new_ni; struct page *page; int err; @@ -1042,17 +1045,18 @@ struct page *new_node_page(struct dnode_of_data *dn, if (!page) return ERR_PTR(-ENOMEM); - if (unlikely(!inc_valid_node_count(sbi, dn->inode))) { - err = -ENOSPC; + if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; - } - get_node_info(sbi, dn->nid, &old_ni); - - /* Reinitialize old_ni with new node page */ - f2fs_bug_on(sbi, old_ni.blk_addr != NULL_ADDR); - new_ni = old_ni; +#ifdef CONFIG_F2FS_CHECK_FS + get_node_info(sbi, dn->nid, &new_ni); + f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR); +#endif + new_ni.nid = dn->nid; new_ni.ino = dn->inode->i_ino; + new_ni.blk_addr = NULL_ADDR; + new_ni.flag = 0; + new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); f2fs_wait_on_page_writeback(page, NODE, true); @@ -1153,6 +1157,7 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, f2fs_put_page(page, 1); return ERR_PTR(err); } else if (err == LOCKED_PAGE) { + err = 0; goto page_hit; } @@ -1166,15 +1171,27 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, goto repeat; } - if (unlikely(!PageUptodate(page))) + if (unlikely(!PageUptodate(page))) { + err = -EIO; goto out_err; + } + + if (!f2fs_inode_chksum_verify(sbi, page)) { + err = -EBADMSG; + goto out_err; + } page_hit: if(unlikely(nid != nid_of_node(page))) { - f2fs_bug_on(sbi, 1); - ClearPageUptodate(page); + f2fs_msg(sbi->sb, KERN_WARNING, "inconsistent node block, " + "nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + err = -EINVAL; out_err: + ClearPageUptodate(page); f2fs_put_page(page, 1); - return ERR_PTR(-EIO); + return ERR_PTR(err); } return page; } @@ -1318,16 +1335,103 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) return last_page; } +static int __write_node_page(struct page *page, bool atomic, bool *submitted, + struct writeback_control *wbc, bool do_balance, + enum iostat_type io_type) +{ + struct f2fs_sb_info *sbi = F2FS_P_SB(page); + nid_t nid; + struct node_info ni; + struct f2fs_io_info fio = { + .sbi = sbi, + .type = NODE, + .op = REQ_OP_WRITE, + .op_flags = wbc_to_write_flags(wbc), + .page = page, + .encrypted_page = NULL, + .submitted = false, + .io_type = io_type, + }; + + trace_f2fs_writepage(page, NODE); + + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto redirty_out; + if (unlikely(f2fs_cp_error(sbi))) + goto redirty_out; + + /* get old block addr of this node page */ + nid = nid_of_node(page); + f2fs_bug_on(sbi, page->index != nid); + + if (wbc->for_reclaim) { + if (!down_read_trylock(&sbi->node_write)) + goto redirty_out; + } else { + down_read(&sbi->node_write); + } + + get_node_info(sbi, nid, &ni); + + /* This page is already truncated */ + if (unlikely(ni.blk_addr == NULL_ADDR)) { + ClearPageUptodate(page); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + unlock_page(page); + return 0; + } + + if (atomic && !test_opt(sbi, NOBARRIER)) + fio.op_flags |= WRITE_FLUSH_FUA; + + set_page_writeback(page); + fio.old_blkaddr = ni.blk_addr; + write_node_page(nid, &fio); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + dec_page_count(sbi, F2FS_DIRTY_NODES); + up_read(&sbi->node_write); + + if (wbc->for_reclaim) { + f2fs_submit_merged_write_cond(sbi, page->mapping->host, 0, + page->index, NODE); + submitted = NULL; + } + + unlock_page(page); + + if (unlikely(f2fs_cp_error(sbi))) { + f2fs_submit_merged_write(sbi, NODE); + submitted = NULL; + } + if (submitted) + *submitted = fio.submitted; + + if (do_balance) + f2fs_balance_fs(sbi, false); + return 0; + +redirty_out: + redirty_page_for_writepage(wbc, page); + return AOP_WRITEPAGE_ACTIVATE; +} + +static int f2fs_write_node_page(struct page *page, + struct writeback_control *wbc) +{ + return __write_node_page(page, false, NULL, wbc, false, FS_NODE_IO); +} + int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic) { pgoff_t index, end; + pgoff_t last_idx = ULONG_MAX; struct pagevec pvec; int ret = 0; struct page *last_page = NULL; bool marked = false; nid_t ino = inode->i_ino; - int nwritten = 0; if (atomic) { last_page = last_fsync_dnode(sbi, ino); @@ -1349,6 +1453,7 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { f2fs_put_page(last_page, 0); @@ -1380,6 +1485,9 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, f2fs_wait_on_page_writeback(page, NODE, true); BUG_ON(PageWriteback(page)); + set_fsync_mark(page, 0); + set_dentry_mark(page, 0); + if (!atomic || page == last_page) { set_fsync_mark(page, 1); if (IS_INODE(page)) { @@ -1397,13 +1505,16 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, if (!clear_page_dirty_for_io(page)) goto continue_unlock; - ret = NODE_MAPPING(sbi)->a_ops->writepage(page, wbc); + ret = __write_node_page(page, atomic && + page == last_page, + &submitted, wbc, true, + FS_NODE_IO); if (ret) { unlock_page(page); f2fs_put_page(last_page, 0); break; - } else { - nwritten++; + } else if (submitted) { + last_idx = page->index; } if (page == last_page) { @@ -1429,12 +1540,13 @@ int fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, goto retry; } out: - if (nwritten) - f2fs_submit_merged_bio_cond(sbi, NULL, NULL, ino, NODE, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, NULL, ino, last_idx, NODE); return ret ? -EIO: 0; } -int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) +int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc, + bool do_balance, enum iostat_type io_type) { pgoff_t index, end; struct pagevec pvec; @@ -1458,6 +1570,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) for (i = 0; i < nr_pages; i++) { struct page *page = pvec.pages[i]; + bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { pagevec_release(&pvec); @@ -1511,9 +1624,11 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) set_fsync_mark(page, 0); set_dentry_mark(page, 0); - if (NODE_MAPPING(sbi)->a_ops->writepage(page, wbc)) + ret = __write_node_page(page, false, &submitted, + wbc, do_balance, io_type); + if (ret) unlock_page(page); - else + else if (submitted) nwritten++; if (--wbc->nr_to_write == 0) @@ -1534,7 +1649,7 @@ int sync_node_pages(struct f2fs_sb_info *sbi, struct writeback_control *wbc) } out: if (nwritten) - f2fs_submit_merged_bio(sbi, NODE, WRITE); + f2fs_submit_merged_write(sbi, NODE); return ret; } @@ -1580,72 +1695,6 @@ int wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, nid_t ino) return ret; } -static int f2fs_write_node_page(struct page *page, - struct writeback_control *wbc) -{ - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - nid_t nid; - struct node_info ni; - struct f2fs_io_info fio = { - .sbi = sbi, - .type = NODE, - .op = REQ_OP_WRITE, - .op_flags = wbc_to_write_flags(wbc), - .page = page, - .encrypted_page = NULL, - }; - - trace_f2fs_writepage(page, NODE); - - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - goto redirty_out; - if (unlikely(f2fs_cp_error(sbi))) - goto redirty_out; - - /* get old block addr of this node page */ - nid = nid_of_node(page); - f2fs_bug_on(sbi, page->index != nid); - - if (wbc->for_reclaim) { - if (!down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - down_read(&sbi->node_write); - } - - get_node_info(sbi, nid, &ni); - - /* This page is already truncated */ - if (unlikely(ni.blk_addr == NULL_ADDR)) { - ClearPageUptodate(page); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - unlock_page(page); - return 0; - } - - set_page_writeback(page); - fio.old_blkaddr = ni.blk_addr; - write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); - dec_page_count(sbi, F2FS_DIRTY_NODES); - up_read(&sbi->node_write); - - if (wbc->for_reclaim) - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, NODE, WRITE); - - unlock_page(page); - - if (unlikely(f2fs_cp_error(sbi))) - f2fs_submit_merged_bio(sbi, NODE, WRITE); - - return 0; - -redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; -} - static int f2fs_write_node_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -1653,6 +1702,9 @@ static int f2fs_write_node_pages(struct address_space *mapping, struct blk_plug plug; long diff; + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + goto skip_write; + /* balancing f2fs's metadata in background */ f2fs_balance_fs_bg(sbi); @@ -1665,7 +1717,7 @@ static int f2fs_write_node_pages(struct address_space *mapping, diff = nr_pages_to_write(sbi, NODE, wbc); wbc->sync_mode = WB_SYNC_NONE; blk_start_plug(&plug); - sync_node_pages(sbi, wbc); + sync_node_pages(sbi, wbc, true, FS_NODE_IO); blk_finish_plug(&plug); wbc->nr_to_write = max((long)0, wbc->nr_to_write - diff); return 0; @@ -1743,43 +1795,71 @@ static void __remove_nid_from_list(struct f2fs_sb_info *sbi, radix_tree_delete(&nm_i->free_nid_root, i->nid); } -static int add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) +/* return if the nid is recognized as free */ +static bool add_free_nid(struct f2fs_sb_info *sbi, nid_t nid, bool build) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct free_nid *i; + struct free_nid *i, *e; struct nat_entry *ne; - int err; + int err = -EINVAL; + bool ret = false; /* 0 nid should not be used */ if (unlikely(nid == 0)) - return 0; - - if (build) { - /* do not add allocated nids */ - ne = __lookup_nat_cache(nm_i, nid); - if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || - nat_get_blkaddr(ne) != NULL_ADDR)) - return 0; - } + return false; i = f2fs_kmem_cache_alloc(free_nid_slab, GFP_NOFS); i->nid = nid; i->state = NID_NEW; - if (radix_tree_preload(GFP_NOFS)) { - kmem_cache_free(free_nid_slab, i); - return 0; - } + if (radix_tree_preload(GFP_NOFS)) + goto err; spin_lock(&nm_i->nid_list_lock); + + if (build) { + /* + * Thread A Thread B + * - f2fs_create + * - f2fs_new_inode + * - alloc_nid + * - __insert_nid_to_list(ALLOC_NID_LIST) + * - f2fs_balance_fs_bg + * - build_free_nids + * - __build_free_nids + * - scan_nat_page + * - add_free_nid + * - __lookup_nat_cache + * - f2fs_add_link + * - init_inode_metadata + * - new_inode_page + * - new_node_page + * - set_node_addr + * - alloc_nid_done + * - __remove_nid_from_list(ALLOC_NID_LIST) + * - __insert_nid_to_list(FREE_NID_LIST) + */ + ne = __lookup_nat_cache(nm_i, nid); + if (ne && (!get_nat_flag(ne, IS_CHECKPOINTED) || + nat_get_blkaddr(ne) != NULL_ADDR)) + goto err_out; + + e = __lookup_free_nid_list(nm_i, nid); + if (e) { + if (e->state == NID_NEW) + ret = true; + goto err_out; + } + } + ret = true; err = __insert_nid_to_list(sbi, i, FREE_NID_LIST, true); +err_out: spin_unlock(&nm_i->nid_list_lock); radix_tree_preload_end(); - if (err) { +err: + if (err) kmem_cache_free(free_nid_slab, i); - return 0; - } - return 1; + return ret; } static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) @@ -1800,17 +1880,45 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) kmem_cache_free(free_nid_slab, i); } +static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, + bool set, bool build) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_ofs = NAT_BLOCK_OFFSET(nid); + unsigned int nid_ofs = nid - START_NID(nid); + + if (!test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + if (set) + __set_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + else + __clear_bit_le(nid_ofs, nm_i->free_nid_bitmap[nat_ofs]); + + if (set) + nm_i->free_nid_count[nat_ofs]++; + else if (!build) + nm_i->free_nid_count[nat_ofs]--; +} + static void scan_nat_page(struct f2fs_sb_info *sbi, struct page *nat_page, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; + unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; + if (test_bit_le(nat_ofs, nm_i->nat_block_bitmap)) + return; + + __set_bit_le(nat_ofs, nm_i->nat_block_bitmap); + i = start_nid % NAT_ENTRY_PER_BLOCK; for (; i < NAT_ENTRY_PER_BLOCK; i++, start_nid++) { + bool freed = false; if (unlikely(start_nid >= nm_i->max_nid)) break; @@ -1818,11 +1926,58 @@ static void scan_nat_page(struct f2fs_sb_info *sbi, blk_addr = le32_to_cpu(nat_blk->entries[i].block_addr); f2fs_bug_on(sbi, blk_addr == NEW_ADDR); if (blk_addr == NULL_ADDR) - add_free_nid(sbi, start_nid, true); + freed = add_free_nid(sbi, start_nid, true); + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, start_nid, freed, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); } } -static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) +static void scan_free_nid_bits(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); + struct f2fs_journal *journal = curseg->journal; + unsigned int i, idx; + + down_read(&nm_i->nat_tree_lock); + + for (i = 0; i < nm_i->nat_blocks; i++) { + if (!test_bit_le(i, nm_i->nat_block_bitmap)) + continue; + if (!nm_i->free_nid_count[i]) + continue; + for (idx = 0; idx < NAT_ENTRY_PER_BLOCK; idx++) { + nid_t nid; + + if (!test_bit_le(idx, nm_i->free_nid_bitmap[i])) + continue; + + nid = i * NAT_ENTRY_PER_BLOCK + idx; + add_free_nid(sbi, nid, true); + + if (nm_i->nid_cnt[FREE_NID_LIST] >= MAX_FREE_NIDS) + goto out; + } + } +out: + down_read(&curseg->journal_rwsem); + for (i = 0; i < nats_in_cursum(journal); i++) { + block_t addr; + nid_t nid; + + addr = le32_to_cpu(nat_in_journal(journal, i).block_addr); + nid = le32_to_cpu(nid_in_journal(journal, i)); + if (addr == NULL_ADDR) + add_free_nid(sbi, nid, true); + else + remove_free_nid(sbi, nid); + } + up_read(&curseg->journal_rwsem); + up_read(&nm_i->nat_tree_lock); +} + +static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -1830,6 +1985,9 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) int i = 0; nid_t nid = nm_i->next_scan_nid; + if (unlikely(nid >= nm_i->max_nid)) + nid = 0; + /* Enough entries */ if (nm_i->nid_cnt[FREE_NID_LIST] >= NAT_ENTRY_PER_BLOCK) return; @@ -1837,6 +1995,14 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) if (!sync && !available_free_memory(sbi, FREE_NIDS)) return; + if (!mount) { + /* try to find free nids in free_nid_bitmap */ + scan_free_nid_bits(sbi); + + if (nm_i->nid_cnt[FREE_NID_LIST]) + return; + } + /* readahead nat pages to be scanned */ ra_meta_pages(sbi, NAT_BLOCK_OFFSET(nid), FREE_NID_PAGES, META_NAT, true); @@ -1879,10 +2045,10 @@ static void __build_free_nids(struct f2fs_sb_info *sbi, bool sync) nm_i->ra_nid_pages, META_NAT, false); } -void build_free_nids(struct f2fs_sb_info *sbi, bool sync) +void build_free_nids(struct f2fs_sb_info *sbi, bool sync, bool mount) { mutex_lock(&NM_I(sbi)->build_lock); - __build_free_nids(sbi, sync); + __build_free_nids(sbi, sync, mount); mutex_unlock(&NM_I(sbi)->build_lock); } @@ -1897,8 +2063,10 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) struct free_nid *i = NULL; retry: #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_ALLOC_NID)) + if (time_to_inject(sbi, FAULT_ALLOC_NID)) { + f2fs_show_injection_info(FAULT_ALLOC_NID); return false; + } #endif spin_lock(&nm_i->nid_list_lock); @@ -1918,13 +2086,16 @@ bool alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid) i->state = NID_ALLOC; __insert_nid_to_list(sbi, i, ALLOC_NID_LIST, false); nm_i->available_nids--; + + update_free_nid_bitmap(sbi, *nid, false, false); + spin_unlock(&nm_i->nid_list_lock); return true; } spin_unlock(&nm_i->nid_list_lock); /* Let's scan nat pages and its caches to get free nids */ - build_free_nids(sbi, true); + build_free_nids(sbi, true, false); goto retry; } @@ -1972,6 +2143,8 @@ void alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid) nm_i->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&nm_i->nid_list_lock); if (need_free) @@ -2034,38 +2207,47 @@ void recover_inline_xattr(struct inode *inode, struct page *page) f2fs_put_page(ipage, 1); } -void recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) +int recover_xattr_data(struct inode *inode, struct page *page, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t prev_xnid = F2FS_I(inode)->i_xattr_nid; - nid_t new_xnid = nid_of_node(page); + nid_t new_xnid; + struct dnode_of_data dn; struct node_info ni; + struct page *xpage; - /* 1: invalidate the previous xattr nid */ if (!prev_xnid) goto recover_xnid; - /* Deallocate node address */ + /* 1: invalidate the previous xattr nid */ get_node_info(sbi, prev_xnid, &ni); f2fs_bug_on(sbi, ni.blk_addr == NULL_ADDR); invalidate_blocks(sbi, ni.blk_addr); - dec_valid_node_count(sbi, inode); + dec_valid_node_count(sbi, inode, false); set_node_addr(sbi, &ni, NULL_ADDR, false); recover_xnid: - /* 2: allocate new xattr nid */ - if (unlikely(!inc_valid_node_count(sbi, inode))) - f2fs_bug_on(sbi, 1); + /* 2: update xattr nid in inode */ + if (!alloc_nid(sbi, &new_xnid)) + return -ENOSPC; - remove_free_nid(sbi, new_xnid); - get_node_info(sbi, new_xnid, &ni); - ni.ino = inode->i_ino; - set_node_addr(sbi, &ni, NEW_ADDR, false); - f2fs_i_xnid_write(inode, new_xnid); + set_new_dnode(&dn, inode, NULL, NULL, new_xnid); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xpage)) { + alloc_nid_failed(sbi, new_xnid); + return PTR_ERR(xpage); + } - /* 3: update xattr blkaddr */ - refresh_sit_entry(sbi, NEW_ADDR, blkaddr); - set_node_addr(sbi, &ni, blkaddr, false); + alloc_nid_done(sbi, new_xnid); + update_inode_page(inode); + + /* 3: update and set xattr node page dirty */ + memcpy(F2FS_NODE(xpage), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); + + set_page_dirty(xpage); + f2fs_put_page(xpage, 1); + + return 0; } int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) @@ -2101,12 +2283,19 @@ int recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) dst->i_blocks = cpu_to_le64(1); dst->i_links = cpu_to_le32(1); dst->i_xattr_nid = 0; - dst->i_inline = src->i_inline & F2FS_INLINE_XATTR; + dst->i_inline = src->i_inline & (F2FS_INLINE_XATTR | F2FS_EXTRA_ATTR); + if (dst->i_inline & F2FS_EXTRA_ATTR) { + dst->i_extra_isize = src->i_extra_isize; + if (f2fs_sb_has_project_quota(sbi->sb) && + F2FS_FITS_IN_INODE(src, le16_to_cpu(src->i_extra_isize), + i_projid)) + dst->i_projid = src->i_projid; + } new_ni = old_ni; new_ni.ino = ino; - if (unlikely(!inc_valid_node_count(sbi, NULL))) + if (unlikely(inc_valid_node_count(sbi, NULL, true))) WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); @@ -2208,8 +2397,39 @@ static void __adjust_nat_entry_set(struct nat_entry_set *nes, list_add_tail(&nes->set_list, head); } +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, + struct page *page) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_index = start_nid / NAT_ENTRY_PER_BLOCK; + struct f2fs_nat_block *nat_blk = page_address(page); + int valid = 0; + int i; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < NAT_ENTRY_PER_BLOCK; i++) { + if (start_nid == 0 && i == 0) + valid++; + if (nat_blk->entries[i].block_addr) + valid++; + } + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; + } + + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); +} + static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, - struct nat_entry_set *set) + struct nat_entry_set *set, struct cp_control *cpc) { struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); struct f2fs_journal *journal = curseg->journal; @@ -2224,7 +2444,8 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if (!__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; if (to_journal) { @@ -2241,8 +2462,7 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, nid_t nid = nat_get_nid(ne); int offset; - if (nat_get_blkaddr(ne) == NEW_ADDR) - continue; + f2fs_bug_on(sbi, nat_get_blkaddr(ne) == NEW_ADDR); if (to_journal) { offset = lookup_journal_in_cursum(journal, @@ -2255,30 +2475,38 @@ static void __flush_nat_entry_set(struct f2fs_sb_info *sbi, } raw_nat_from_node_info(raw_ne, &ne->ni); nat_reset_flag(ne); - __clear_nat_cache_dirty(NM_I(sbi), ne); + __clear_nat_cache_dirty(NM_I(sbi), set, ne); if (nat_get_blkaddr(ne) == NULL_ADDR) { add_free_nid(sbi, nid, false); spin_lock(&NM_I(sbi)->nid_list_lock); NM_I(sbi)->available_nids++; + update_free_nid_bitmap(sbi, nid, true, false); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } else { + spin_lock(&NM_I(sbi)->nid_list_lock); + update_free_nid_bitmap(sbi, nid, false, false); spin_unlock(&NM_I(sbi)->nid_list_lock); } } - if (to_journal) + if (to_journal) { up_write(&curseg->journal_rwsem); - else + } else { + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); + } - f2fs_bug_on(sbi, set->entry_cnt); - - radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); - kmem_cache_free(nat_entry_set_slab, set); + /* Allow dirty nats by node block allocation in write_begin */ + if (!set->entry_cnt) { + radix_tree_delete(&NM_I(sbi)->nat_set_root, set->set); + kmem_cache_free(nat_entry_set_slab, set); + } } /* * This function is called during the checkpointing process. */ -void flush_nat_entries(struct f2fs_sb_info *sbi) +void flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) { struct f2fs_nm_info *nm_i = NM_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -2299,7 +2527,8 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) * entries, remove all entries from journal and merge them * into nat entry set. */ - if (!__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) + if (enabled_nat_bits(sbi, cpc) || + !__has_cursum_space(journal, nm_i->dirty_nat_cnt, NAT_JOURNAL)) remove_nats_in_journal(sbi); while ((found = __gang_lookup_nat_set(nm_i, @@ -2313,11 +2542,86 @@ void flush_nat_entries(struct f2fs_sb_info *sbi) /* flush dirty nats in nat entry set */ list_for_each_entry_safe(set, tmp, &sets, set_list) - __flush_nat_entry_set(sbi, set); + __flush_nat_entry_set(sbi, set, cpc); up_write(&nm_i->nat_tree_lock); + /* Allow dirty nats by node block allocation in write_begin */ +} - f2fs_bug_on(sbi, nm_i->dirty_nat_cnt); +static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) +{ + struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int nat_bits_bytes = nm_i->nat_blocks / BITS_PER_BYTE; + unsigned int i; + __u64 cp_ver = cur_cp_version(ckpt); + block_t nat_bits_addr; + + if (!enabled_nat_bits(sbi, NULL)) + return 0; + + nm_i->nat_bits_blocks = F2FS_BYTES_TO_BLK((nat_bits_bytes << 1) + 8 + + F2FS_BLKSIZE - 1); + nm_i->nat_bits = kzalloc(nm_i->nat_bits_blocks << F2FS_BLKSIZE_BITS, + GFP_KERNEL); + if (!nm_i->nat_bits) + return -ENOMEM; + + nat_bits_addr = __start_cp_addr(sbi) + sbi->blocks_per_seg - + nm_i->nat_bits_blocks; + for (i = 0; i < nm_i->nat_bits_blocks; i++) { + struct page *page = get_meta_page(sbi, nat_bits_addr++); + + memcpy(nm_i->nat_bits + (i << F2FS_BLKSIZE_BITS), + page_address(page), F2FS_BLKSIZE); + f2fs_put_page(page, 1); + } + + cp_ver |= (cur_cp_crc(ckpt) << 32); + if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { + disable_nat_bits(sbi, true); + return 0; + } + + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + + f2fs_msg(sbi->sb, KERN_NOTICE, "Found nat_bits in checkpoint"); + return 0; +} + +static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + unsigned int i = 0; + nid_t nid, last_nid; + + if (!enabled_nat_bits(sbi, NULL)) + return; + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->empty_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + + nid = i * NAT_ENTRY_PER_BLOCK; + last_nid = (i + 1) * NAT_ENTRY_PER_BLOCK; + + spin_lock(&NM_I(sbi)->nid_list_lock); + for (; nid < last_nid; nid++) + update_free_nid_bitmap(sbi, nid, true, true); + spin_unlock(&NM_I(sbi)->nid_list_lock); + } + + for (i = 0; i < nm_i->nat_blocks; i++) { + i = find_next_bit_le(nm_i->full_nat_bits, nm_i->nat_blocks, i); + if (i >= nm_i->nat_blocks) + break; + + __set_bit_le(i, nm_i->nat_block_bitmap); + } } static int init_node_manager(struct f2fs_sb_info *sbi) @@ -2325,15 +2629,15 @@ static int init_node_manager(struct f2fs_sb_info *sbi) struct f2fs_super_block *sb_raw = F2FS_RAW_SUPER(sbi); struct f2fs_nm_info *nm_i = NM_I(sbi); unsigned char *version_bitmap; - unsigned int nat_segs, nat_blocks; + unsigned int nat_segs; + int err; nm_i->nat_blkaddr = le32_to_cpu(sb_raw->nat_blkaddr); /* segment_count_nat includes pair segment so divide to 2. */ nat_segs = le32_to_cpu(sb_raw->segment_count_nat) >> 1; - nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); - - nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nat_blocks; + nm_i->nat_blocks = nat_segs << le32_to_cpu(sb_raw->log_blocks_per_seg); + nm_i->max_nid = NAT_ENTRY_PER_BLOCK * nm_i->nat_blocks; /* not used nids: 0, node, meta, (and root counted as valid node) */ nm_i->available_nids = nm_i->max_nid - sbi->total_valid_node_count - @@ -2367,6 +2671,10 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + err = __get_nat_bitmaps(sbi); + if (err) + return err; + #ifdef CONFIG_F2FS_CHECK_FS nm_i->nat_bitmap_mir = kmemdup(version_bitmap, nm_i->bitmap_size, GFP_KERNEL); @@ -2377,6 +2685,27 @@ static int init_node_manager(struct f2fs_sb_info *sbi) return 0; } +static int init_free_nid_cache(struct f2fs_sb_info *sbi) +{ + struct f2fs_nm_info *nm_i = NM_I(sbi); + + nm_i->free_nid_bitmap = kvzalloc(nm_i->nat_blocks * + NAT_ENTRY_BITMAP_SIZE, GFP_KERNEL); + if (!nm_i->free_nid_bitmap) + return -ENOMEM; + + nm_i->nat_block_bitmap = kvzalloc(nm_i->nat_blocks / 8, + GFP_KERNEL); + if (!nm_i->nat_block_bitmap) + return -ENOMEM; + + nm_i->free_nid_count = kvzalloc(nm_i->nat_blocks * + sizeof(unsigned short), GFP_KERNEL); + if (!nm_i->free_nid_count) + return -ENOMEM; + return 0; +} + int build_node_manager(struct f2fs_sb_info *sbi) { int err; @@ -2389,7 +2718,14 @@ int build_node_manager(struct f2fs_sb_info *sbi) if (err) return err; - build_free_nids(sbi, true); + err = init_free_nid_cache(sbi); + if (err) + return err; + + /* load free nid status from nat_bits table */ + load_free_nid_bitmap(sbi); + + build_free_nids(sbi, true, true); return 0; } @@ -2447,7 +2783,12 @@ void destroy_node_manager(struct f2fs_sb_info *sbi) } up_write(&nm_i->nat_tree_lock); + kvfree(nm_i->nat_block_bitmap); + kvfree(nm_i->free_nid_bitmap); + kvfree(nm_i->free_nid_count); + kfree(nm_i->nat_bitmap); + kfree(nm_i->nat_bits); #ifdef CONFIG_F2FS_CHECK_FS kfree(nm_i->nat_bitmap_mir); #endif diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 29ff783eb9c3..bb53e9955ff2 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -9,10 +9,10 @@ * published by the Free Software Foundation. */ /* start node id of a node block dedicated to the given node id */ -#define START_NID(nid) ((nid / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) +#define START_NID(nid) (((nid) / NAT_ENTRY_PER_BLOCK) * NAT_ENTRY_PER_BLOCK) /* node block offset on the NAT area dedicated to the given start node id */ -#define NAT_BLOCK_OFFSET(start_nid) (start_nid / NAT_ENTRY_PER_BLOCK) +#define NAT_BLOCK_OFFSET(start_nid) ((start_nid) / NAT_ENTRY_PER_BLOCK) /* # of pages to perform synchronous readahead before building free nids */ #define FREE_NID_PAGES 8 @@ -62,16 +62,16 @@ struct nat_entry { struct node_info ni; /* in-memory node information */ }; -#define nat_get_nid(nat) (nat->ni.nid) -#define nat_set_nid(nat, n) (nat->ni.nid = n) -#define nat_get_blkaddr(nat) (nat->ni.blk_addr) -#define nat_set_blkaddr(nat, b) (nat->ni.blk_addr = b) -#define nat_get_ino(nat) (nat->ni.ino) -#define nat_set_ino(nat, i) (nat->ni.ino = i) -#define nat_get_version(nat) (nat->ni.version) -#define nat_set_version(nat, v) (nat->ni.version = v) +#define nat_get_nid(nat) ((nat)->ni.nid) +#define nat_set_nid(nat, n) ((nat)->ni.nid = (n)) +#define nat_get_blkaddr(nat) ((nat)->ni.blk_addr) +#define nat_set_blkaddr(nat, b) ((nat)->ni.blk_addr = (b)) +#define nat_get_ino(nat) ((nat)->ni.ino) +#define nat_set_ino(nat, i) ((nat)->ni.ino = (i)) +#define nat_get_version(nat) ((nat)->ni.version) +#define nat_set_version(nat, v) ((nat)->ni.version = (v)) -#define inc_node_version(version) (++version) +#define inc_node_version(version) (++(version)) static inline void copy_node_info(struct node_info *dst, struct node_info *src) @@ -200,21 +200,18 @@ static inline pgoff_t current_nat_addr(struct f2fs_sb_info *sbi, nid_t start) struct f2fs_nm_info *nm_i = NM_I(sbi); pgoff_t block_off; pgoff_t block_addr; - int seg_off; + /* + * block_off = segment_off * 512 + off_in_segment + * OLD = (segment_off * 512) * 2 + off_in_segment + * NEW = 2 * (segment_off * 512 + off_in_segment) - off_in_segment + */ block_off = NAT_BLOCK_OFFSET(start); - seg_off = block_off >> sbi->log_blocks_per_seg; block_addr = (pgoff_t)(nm_i->nat_blkaddr + - (seg_off << sbi->log_blocks_per_seg << 1) + + (block_off << 1) - (block_off & (sbi->blocks_per_seg - 1))); -#ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_bit(block_off, nm_i->nat_bitmap) != - f2fs_test_bit(block_off, nm_i->nat_bitmap_mir)) - f2fs_bug_on(sbi, 1); -#endif - if (f2fs_test_bit(block_off, nm_i->nat_bitmap)) block_addr += sbi->blocks_per_seg; @@ -227,11 +224,7 @@ static inline pgoff_t next_nat_addr(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); block_addr -= nm_i->nat_blkaddr; - if ((block_addr >> sbi->log_blocks_per_seg) % 2) - block_addr -= sbi->blocks_per_seg; - else - block_addr += sbi->blocks_per_seg; - + block_addr ^= 1 << sbi->log_blocks_per_seg; return block_addr + nm_i->nat_blkaddr; } @@ -306,14 +299,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); struct f2fs_node *rn = F2FS_NODE(page); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); - __u64 cp_ver = le64_to_cpu(ckpt->checkpoint_ver); + __u64 cp_ver = cur_cp_version(ckpt); + + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } rn->footer.cp_ver = cpu_to_le64(cp_ver); rn->footer.next_blkaddr = cpu_to_le32(blkaddr); } @@ -321,14 +311,11 @@ static inline void fill_node_footer_blkaddr(struct page *page, block_t blkaddr) static inline bool is_recoverable_dnode(struct page *page) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(F2FS_P_SB(page)); - size_t crc_offset = le32_to_cpu(ckpt->checksum_offset); __u64 cp_ver = cur_cp_version(ckpt); - if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) { - __u64 crc = le32_to_cpu(*((__le32 *) - ((unsigned char *)ckpt + crc_offset))); - cp_ver |= (crc << 32); - } + if (__is_set_ckpt_flags(ckpt, CP_CRC_RECOVERY_FLAG)) + cp_ver |= (cur_cp_crc(ckpt) << 32); + return cp_ver == cpver_of_node(page); } @@ -358,7 +345,7 @@ static inline bool IS_DNODE(struct page *node_page) unsigned int ofs = ofs_of_node(node_page); if (f2fs_has_xattr_block(ofs)) - return false; + return true; if (ofs == 3 || ofs == 4 + NIDS_PER_BLOCK || ofs == 5 + 2 * NIDS_PER_BLOCK) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 4a3c48c24c10..9626758bc762 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -69,20 +69,34 @@ static struct fsync_inode_entry *get_fsync_inode(struct list_head *head, } static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi, - struct list_head *head, nid_t ino) + struct list_head *head, nid_t ino, bool quota_inode) { struct inode *inode; struct fsync_inode_entry *entry; + int err; inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return ERR_CAST(inode); + err = dquot_initialize(inode); + if (err) + goto err_out; + + if (quota_inode) { + err = dquot_alloc_inode(inode); + if (err) + goto err_out; + } + entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO); entry->inode = inode; list_add_tail(&entry->list, head); return entry; +err_out: + iput(inode); + return ERR_PTR(err); } static void del_fsync_inode(struct fsync_inode_entry *entry) @@ -107,7 +121,8 @@ static int recover_dentry(struct inode *inode, struct page *ipage, entry = get_fsync_inode(dir_list, pino); if (!entry) { - entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, pino); + entry = add_fsync_inode(F2FS_I_SB(inode), dir_list, + pino, false); if (IS_ERR(entry)) { dir = ERR_CAST(entry); err = PTR_ERR(entry); @@ -140,6 +155,13 @@ static int recover_dentry(struct inode *inode, struct page *ipage, err = -EEXIST; goto out_unmap_put; } + + err = dquot_initialize(einode); + if (err) { + iput(einode); + goto out_unmap_put; + } + err = acquire_orphan_inode(F2FS_I_SB(inode)); if (err) { iput(einode); @@ -198,7 +220,8 @@ static void recover_inode(struct inode *inode, struct page *page) ino_of_node(page), name); } -static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) +static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, + bool check_only) { struct curseg_info *curseg; struct page *page = NULL; @@ -225,17 +248,22 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head) entry = get_fsync_inode(head, ino_of_node(page)); if (!entry) { - if (IS_INODE(page) && is_dent_dnode(page)) { + bool quota_inode = false; + + if (!check_only && + IS_INODE(page) && is_dent_dnode(page)) { err = recover_inode_page(sbi, page); if (err) break; + quota_inode = true; } /* * CP | dnode(F) | inode(DF) * For this case, we should not give up now. */ - entry = add_fsync_inode(sbi, head, ino_of_node(page)); + entry = add_fsync_inode(sbi, head, ino_of_node(page), + quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) { @@ -326,10 +354,18 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, f2fs_put_page(node_page, 1); if (ino != dn->inode->i_ino) { + int ret; + /* Deallocate previous index in the node page */ inode = f2fs_iget_retry(sbi->sb, ino); if (IS_ERR(inode)) return PTR_ERR(inode); + + ret = dquot_initialize(inode); + if (ret) { + iput(inode); + return ret; + } } else { inode = dn->inode; } @@ -359,7 +395,8 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, return 0; truncate_out: - if (datablock_addr(tdn.node_page, tdn.ofs_in_node) == blkaddr) + if (datablock_addr(tdn.inode, tdn.node_page, + tdn.ofs_in_node) == blkaddr) truncate_data_blocks_range(&tdn, 1); if (dn->inode->i_ino == nid && !dn->inode_page_locked) unlock_page(dn->inode_page); @@ -378,11 +415,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, if (IS_INODE(page)) { recover_inline_xattr(inode, page); } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - /* - * Deprecated; xattr blocks should be found from cold log. - * But, we should remain this for backward compatibility. - */ - recover_xattr_data(inode, page, blkaddr); + err = recover_xattr_data(inode, page, blkaddr); + if (!err) + recovered++; goto out; } @@ -414,8 +449,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, for (; start < end; start++, dn.ofs_in_node++) { block_t src, dest; - src = datablock_addr(dn.node_page, dn.ofs_in_node); - dest = datablock_addr(page, dn.ofs_in_node); + src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node); + dest = datablock_addr(dn.inode, page, dn.ofs_in_node); /* skip recovering if dest is the same as src */ if (src == dest) @@ -428,8 +463,9 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } if (!file_keep_isize(inode) && - (i_size_read(inode) <= (start << PAGE_SHIFT))) - f2fs_i_size_write(inode, (start + 1) << PAGE_SHIFT); + (i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT))) + f2fs_i_size_write(inode, + (loff_t)(start + 1) << PAGE_SHIFT); /* * dest is reserved block, invalidate src block @@ -556,12 +592,27 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) struct list_head dir_list; int err; int ret = 0; + unsigned long s_flags = sbi->sb->s_flags; bool need_writecp = false; + if (s_flags & MS_RDONLY) { + f2fs_msg(sbi->sb, KERN_INFO, "orphan cleanup on readonly fs"); + sbi->sb->s_flags &= ~MS_RDONLY; + } + +#ifdef CONFIG_QUOTA + /* Needed for iput() to work correctly and not trash data */ + sbi->sb->s_flags |= MS_ACTIVE; + /* Turn on quotas so that they are updated correctly */ + f2fs_enable_quota_files(sbi); +#endif + fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry", sizeof(struct fsync_inode_entry)); - if (!fsync_entry_slab) - return -ENOMEM; + if (!fsync_entry_slab) { + err = -ENOMEM; + goto out; + } INIT_LIST_HEAD(&inode_list); INIT_LIST_HEAD(&dir_list); @@ -570,13 +621,13 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) mutex_lock(&sbi->cp_mutex); /* step #1: find fsynced inode numbers */ - err = find_fsync_dnodes(sbi, &inode_list); + err = find_fsync_dnodes(sbi, &inode_list, check_only); if (err || list_empty(&inode_list)) - goto out; + goto skip; if (check_only) { ret = 1; - goto out; + goto skip; } need_writecp = true; @@ -585,7 +636,7 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) err = recover_data(sbi, &inode_list, &dir_list); if (!err) f2fs_bug_on(sbi, !list_empty(&inode_list)); -out: +skip: destroy_fsync_dnodes(&inode_list); /* truncate meta pages to be used by the recovery */ @@ -598,8 +649,6 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } clear_sbi_flag(sbi, SBI_POR_DOING); - if (err) - set_ckpt_flags(sbi, CP_ERROR_FLAG); mutex_unlock(&sbi->cp_mutex); /* let's drop all the directory inodes for clean checkpoint */ @@ -613,5 +662,12 @@ int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only) } kmem_cache_destroy(fsync_entry_slab); +out: +#ifdef CONFIG_QUOTA + /* Turn quotas off */ + f2fs_quota_off_umount(sbi->sb); +#endif + sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */ + return ret ? ret: err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 289b3facd2d8..059a219b7740 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -16,10 +16,13 @@ #include #include #include +#include +#include #include "f2fs.h" #include "segment.h" #include "node.h" +#include "gc.h" #include "trace.h" #include @@ -166,6 +169,21 @@ static unsigned long __find_rev_next_zero_bit(const unsigned long *addr, return result - size + __reverse_ffz(tmp); } +bool need_SSR(struct f2fs_sb_info *sbi) +{ + int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); + int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); + int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); + + if (test_opt(sbi, LFS)) + return false; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + return true; + + return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + + 2 * reserved_sections(sbi)); +} + void register_inmem_page(struct inode *inode, struct page *page) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -212,9 +230,15 @@ static int __revoke_inmem_pages(struct inode *inode, struct node_info ni; trace_f2fs_commit_inmem_page(page, INMEM_REVOKE); - +retry: set_new_dnode(&dn, inode, NULL, NULL, 0); - if (get_dnode_of_data(&dn, page->index, LOOKUP_NODE)) { + err = get_dnode_of_data(&dn, page->index, LOOKUP_NODE); + if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } err = -EAGAIN; goto next; } @@ -247,9 +271,40 @@ void drop_inmem_pages(struct inode *inode) mutex_unlock(&fi->inmem_lock); clear_inode_flag(inode, FI_ATOMIC_FILE); + clear_inode_flag(inode, FI_HOT_DATA); stat_dec_atomic_write(inode); } +void drop_inmem_page(struct inode *inode, struct page *page) +{ + struct f2fs_inode_info *fi = F2FS_I(inode); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + struct list_head *head = &fi->inmem_pages; + struct inmem_pages *cur = NULL; + + f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page)); + + mutex_lock(&fi->inmem_lock); + list_for_each_entry(cur, head, list) { + if (cur->page == page) + break; + } + + f2fs_bug_on(sbi, !cur || cur->page != page); + list_del(&cur->list); + mutex_unlock(&fi->inmem_lock); + + dec_page_count(sbi, F2FS_INMEM_PAGES); + kmem_cache_free(inmem_entry_slab, cur); + + ClearPageUptodate(page); + set_page_private(page, 0); + ClearPagePrivate(page); + f2fs_put_page(page, 0); + + trace_f2fs_commit_inmem_page(page, INMEM_INVALIDATE); +} + static int __commit_inmem_pages(struct inode *inode, struct list_head *revoke_list) { @@ -260,10 +315,10 @@ static int __commit_inmem_pages(struct inode *inode, .sbi = sbi, .type = DATA, .op = REQ_OP_WRITE, - .op_flags = REQ_SYNC | REQ_NOIDLE | REQ_PRIO, - .encrypted_page = NULL, + .op_flags = REQ_SYNC | REQ_PRIO, + .io_type = FS_DATA_IO, }; - bool submit_bio = false; + pgoff_t last_idx = ULONG_MAX; int err = 0; list_for_each_entry_safe(cur, tmp, &fi->inmem_pages, list) { @@ -279,25 +334,31 @@ static int __commit_inmem_pages(struct inode *inode, inode_dec_dirty_pages(inode); remove_dirty_inode(inode); } - +retry: fio.page = page; + fio.old_blkaddr = NULL_ADDR; + fio.encrypted_page = NULL; + fio.need_lock = LOCK_DONE; err = do_write_data_page(&fio); if (err) { + if (err == -ENOMEM) { + congestion_wait(BLK_RW_ASYNC, HZ/50); + cond_resched(); + goto retry; + } unlock_page(page); break; } - /* record old blkaddr for revoking */ cur->old_addr = fio.old_blkaddr; - - submit_bio = true; + last_idx = page->index; } unlock_page(page); list_move_tail(&cur->list, revoke_list); } - if (submit_bio) - f2fs_submit_merged_bio_cond(sbi, inode, NULL, 0, DATA, WRITE); + if (last_idx != ULONG_MAX) + f2fs_submit_merged_write_cond(sbi, inode, 0, last_idx, DATA); if (!err) __revoke_inmem_pages(inode, revoke_list, false, false); @@ -352,15 +413,14 @@ int commit_inmem_pages(struct inode *inode) void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) { #ifdef CONFIG_F2FS_FAULT_INJECTION - if (time_to_inject(sbi, FAULT_CHECKPOINT)) + if (time_to_inject(sbi, FAULT_CHECKPOINT)) { + f2fs_show_injection_info(FAULT_CHECKPOINT); f2fs_stop_checkpoint(sbi, false); + } #endif - if (!need) - return; - /* balance_fs_bg is able to be pending */ - if (excess_cached_nats(sbi)) + if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi); /* @@ -369,7 +429,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) */ if (has_not_enough_free_secs(sbi, 0, 0)) { mutex_lock(&sbi->gc_mutex); - f2fs_gc(sbi, false, false); + f2fs_gc(sbi, false, false, NULL_SEGNO); } } @@ -386,9 +446,9 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) if (!available_free_memory(sbi, FREE_NIDS)) try_to_free_nids(sbi, MAX_FREE_NIDS); else - build_free_nids(sbi, false); + build_free_nids(sbi, false, false); - if (!is_idle(sbi)) + if (!is_idle(sbi) && !excess_dirty_nats(sbi)) return; /* checkpoint is the only way to shrink partial cached entries */ @@ -409,7 +469,8 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi) } } -static int __submit_flush_wait(struct block_device *bdev) +static int __submit_flush_wait(struct f2fs_sb_info *sbi, + struct block_device *bdev) { struct bio *bio = f2fs_bio_alloc(0); int ret; @@ -418,20 +479,24 @@ static int __submit_flush_wait(struct block_device *bdev) bio->bi_bdev = bdev; ret = submit_bio_wait(WRITE_FLUSH, bio); bio_put(bio); + + trace_f2fs_issue_flush(bdev, test_opt(sbi, NOBARRIER), + test_opt(sbi, FLUSH_MERGE), ret); return ret; } static int submit_flush_wait(struct f2fs_sb_info *sbi) { - int ret = __submit_flush_wait(sbi->sb->s_bdev); + int ret = __submit_flush_wait(sbi, sbi->sb->s_bdev); int i; - if (sbi->s_ndevs && !ret) { - for (i = 1; i < sbi->s_ndevs; i++) { - ret = __submit_flush_wait(FDEV(i).bdev); - if (ret) - break; - } + if (!sbi->s_ndevs || ret) + return ret; + + for (i = 1; i < sbi->s_ndevs; i++) { + ret = __submit_flush_wait(sbi, FDEV(i).bdev); + if (ret) + break; } return ret; } @@ -445,6 +510,8 @@ static int issue_flush_thread(void *data) if (kthread_should_stop()) return 0; + sb_start_intwrite(sbi->sb); + if (!llist_empty(&fcc->issue_list)) { struct flush_cmd *cmd, *next; int ret; @@ -453,6 +520,8 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = llist_reverse_order(fcc->dispatch_list); ret = submit_flush_wait(sbi); + atomic_inc(&fcc->issued_flush); + llist_for_each_entry_safe(cmd, next, fcc->dispatch_list, llnode) { cmd->ret = ret; @@ -461,6 +530,8 @@ static int issue_flush_thread(void *data) fcc->dispatch_list = NULL; } + sb_end_intwrite(sbi->sb); + wait_event_interruptible(*q, kthread_should_stop() || !llist_empty(&fcc->issue_list)); goto repeat; @@ -470,36 +541,60 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi) { struct flush_cmd_control *fcc = SM_I(sbi)->fcc_info; struct flush_cmd cmd; - - trace_f2fs_issue_flush(sbi->sb, test_opt(sbi, NOBARRIER), - test_opt(sbi, FLUSH_MERGE)); + int ret; if (test_opt(sbi, NOBARRIER)) return 0; - if (!test_opt(sbi, FLUSH_MERGE) || !atomic_read(&fcc->submit_flush)) { - int ret; - - atomic_inc(&fcc->submit_flush); + if (!test_opt(sbi, FLUSH_MERGE)) { ret = submit_flush_wait(sbi); - atomic_dec(&fcc->submit_flush); + atomic_inc(&fcc->issued_flush); + return ret; + } + + if (atomic_inc_return(&fcc->issing_flush) == 1) { + ret = submit_flush_wait(sbi); + atomic_dec(&fcc->issing_flush); + + atomic_inc(&fcc->issued_flush); return ret; } init_completion(&cmd.wait); - atomic_inc(&fcc->submit_flush); llist_add(&cmd.llnode, &fcc->issue_list); - if (!fcc->dispatch_list) + /* update issue_list before we wake up issue_flush thread */ + smp_mb(); + + if (waitqueue_active(&fcc->flush_wait_queue)) wake_up(&fcc->flush_wait_queue); if (fcc->f2fs_issue_flush) { wait_for_completion(&cmd.wait); - atomic_dec(&fcc->submit_flush); + atomic_dec(&fcc->issing_flush); } else { - llist_del_all(&fcc->issue_list); - atomic_set(&fcc->submit_flush, 0); + struct llist_node *list; + + list = llist_del_all(&fcc->issue_list); + if (!list) { + wait_for_completion(&cmd.wait); + atomic_dec(&fcc->issing_flush); + } else { + struct flush_cmd *tmp, *next; + + ret = submit_flush_wait(sbi); + + llist_for_each_entry_safe(tmp, next, list, llnode) { + if (tmp == &cmd) { + cmd.ret = ret; + atomic_dec(&fcc->issing_flush); + continue; + } + tmp->ret = ret; + complete(&tmp->wait); + } + } } return cmd.ret; @@ -513,16 +608,22 @@ int create_flush_cmd_control(struct f2fs_sb_info *sbi) if (SM_I(sbi)->fcc_info) { fcc = SM_I(sbi)->fcc_info; + if (fcc->f2fs_issue_flush) + return err; goto init_thread; } fcc = kzalloc(sizeof(struct flush_cmd_control), GFP_KERNEL); if (!fcc) return -ENOMEM; - atomic_set(&fcc->submit_flush, 0); + atomic_set(&fcc->issued_flush, 0); + atomic_set(&fcc->issing_flush, 0); init_waitqueue_head(&fcc->flush_wait_queue); init_llist_head(&fcc->issue_list); SM_I(sbi)->fcc_info = fcc; + if (!test_opt(sbi, FLUSH_MERGE)) + return err; + init_thread: fcc->f2fs_issue_flush = kthread_run(issue_flush_thread, sbi, "f2fs_flush-%u:%u", MAJOR(dev), MINOR(dev)); @@ -592,8 +693,8 @@ static void __remove_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno, if (test_and_clear_bit(segno, dirty_i->dirty_segmap[t])) dirty_i->nr_dirty[t]--; - if (get_valid_blocks(sbi, segno, sbi->segs_per_sec) == 0) - clear_bit(GET_SECNO(sbi, segno), + if (get_valid_blocks(sbi, segno, true) == 0) + clear_bit(GET_SEC_FROM_SEG(sbi, segno), dirty_i->victim_secmap); } } @@ -613,7 +714,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == 0) { __locate_dirty_segment(sbi, segno, PRE); @@ -628,61 +729,91 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_unlock(&dirty_i->seglist_lock); } -static struct discard_cmd *__add_discard_cmd(struct f2fs_sb_info *sbi, - struct bio *bio, block_t lstart, block_t len) +static struct discard_cmd *__create_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; struct discard_cmd *dc; + f2fs_bug_on(sbi, !len); + + pend_list = &dcc->pend_list[plist_idx(len)]; + dc = f2fs_kmem_cache_alloc(discard_cmd_slab, GFP_NOFS); INIT_LIST_HEAD(&dc->list); - dc->bio = bio; + dc->bdev = bdev; dc->lstart = lstart; + dc->start = start; dc->len = len; + dc->ref = 0; + dc->state = D_PREP; + dc->error = 0; init_completion(&dc->wait); - list_add_tail(&dc->list, wait_list); + list_add_tail(&dc->list, pend_list); + atomic_inc(&dcc->discard_cmd_cnt); + dcc->undiscard_blks += len; return dc; } -/* This should be covered by global mutex, &sit_i->sentry_lock */ -void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +static struct discard_cmd *__attach_discard_cmd(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node *parent, struct rb_node **p) { - struct list_head *wait_list = &(SM_I(sbi)->discard_cmd_list); - struct discard_cmd *dc, *tmp; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; - list_for_each_entry_safe(dc, tmp, wait_list, list) { - struct bio *bio = dc->bio; - int err; + dc = __create_discard_cmd(sbi, bdev, lstart, start, len); - if (!completion_done(&dc->wait)) { - if ((dc->lstart <= blkaddr && - blkaddr < dc->lstart + dc->len) || - blkaddr == NULL_ADDR) - wait_for_completion_io(&dc->wait); - else - continue; - } + rb_link_node(&dc->rb_node, parent, p); + rb_insert_color(&dc->rb_node, &dcc->root); - err = bio->bi_error; - if (err == -EOPNOTSUPP) - err = 0; + return dc; +} - if (err) - f2fs_msg(sbi->sb, KERN_INFO, - "Issue discard failed, ret: %d", err); +static void __detach_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + if (dc->state == D_DONE) + atomic_dec(&dcc->issing_discard); - bio_put(bio); - list_del(&dc->list); - kmem_cache_free(discard_cmd_slab, dc); - } + list_del(&dc->list); + rb_erase(&dc->rb_node, &dcc->root); + dcc->undiscard_blks -= dc->len; + + kmem_cache_free(discard_cmd_slab, dc); + + atomic_dec(&dcc->discard_cmd_cnt); +} + +static void __remove_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + f2fs_bug_on(sbi, dc->ref); + + if (dc->error == -EOPNOTSUPP) + dc->error = 0; + + if (dc->error) + f2fs_msg(sbi->sb, KERN_INFO, + "Issue discard(%u, %u, %u) failed, ret: %d", + dc->lstart, dc->start, dc->len, dc->error); + __detach_discard_cmd(dcc, dc); } static void f2fs_submit_discard_endio(struct bio *bio) { struct discard_cmd *dc = (struct discard_cmd *)bio->bi_private; - complete(&dc->wait); + dc->error = bio->bi_error; + dc->state = D_DONE; + complete_all(&dc->wait); + bio_put(bio); } /* copied from block/blk-lib.c in 4.10-rc1 */ @@ -739,12 +870,12 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, } if (bio) { - int ret = submit_bio_wait(0, bio); + int ret = submit_bio_wait(op, bio); bio_put(bio); if (ret) return ret; } - bio = f2fs_bio_alloc(0); + bio = f2fs_bio_alloc(1); bio->bi_iter.bi_sector = sector; bio->bi_bdev = bdev; bio_set_op_attrs(bio, op, 0); @@ -766,58 +897,471 @@ static int __blkdev_issue_discard(struct block_device *bdev, sector_t sector, return 0; } +void __check_sit_bitmap(struct f2fs_sb_info *sbi, + block_t start, block_t end) +{ +#ifdef CONFIG_F2FS_CHECK_FS + struct seg_entry *sentry; + unsigned int segno; + block_t blk = start; + unsigned long offset, size, max_blocks = sbi->blocks_per_seg; + unsigned long *map; + + while (blk < end) { + segno = GET_SEGNO(sbi, blk); + sentry = get_seg_entry(sbi, segno); + offset = GET_BLKOFF_FROM_SEG0(sbi, blk); + + if (end < START_BLOCK(sbi, segno + 1)) + size = GET_BLKOFF_FROM_SEG0(sbi, end); + else + size = max_blocks; + map = (unsigned long *)(sentry->cur_valid_map); + offset = __find_rev_next_bit(map, size, offset); + f2fs_bug_on(sbi, offset != size); + blk = START_BLOCK(sbi, segno + 1); + } +#endif +} + /* this function is copied from blkdev_issue_discard from block/blk-lib.c */ -static int __f2fs_issue_discard_async(struct f2fs_sb_info *sbi, +static void __submit_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct bio *bio = NULL; + + if (dc->state != D_PREP) + return; + + trace_f2fs_issue_discard(dc->bdev, dc->start, dc->len); + + dc->error = __blkdev_issue_discard(dc->bdev, + SECTOR_FROM_BLOCK(dc->start), + SECTOR_FROM_BLOCK(dc->len), + GFP_NOFS, 0, &bio); + if (!dc->error) { + /* should keep before submission to avoid D_DONE right away */ + dc->state = D_SUBMIT; + atomic_inc(&dcc->issued_discard); + atomic_inc(&dcc->issing_discard); + if (bio) { + bio->bi_private = dc; + bio->bi_end_io = f2fs_submit_discard_endio; + submit_bio(REQ_SYNC, bio); + list_move_tail(&dc->list, &dcc->wait_list); + __check_sit_bitmap(sbi, dc->start, dc->start + dc->len); + + f2fs_update_iostat(sbi, FS_DISCARD, 1); + } + } else { + __remove_discard_cmd(sbi, dc); + } +} + +static struct discard_cmd *__insert_discard_tree(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len, + struct rb_node **insert_p, + struct rb_node *insert_parent) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct rb_node **p = &dcc->root.rb_node; + struct rb_node *parent = NULL; + struct discard_cmd *dc = NULL; + + if (insert_p && insert_parent) { + parent = insert_parent; + p = insert_p; + goto do_insert; + } + + p = __lookup_rb_tree_for_insert(sbi, &dcc->root, &parent, lstart); +do_insert: + dc = __attach_discard_cmd(sbi, bdev, lstart, start, len, parent, p); + if (!dc) + return NULL; + + return dc; +} + +static void __relocate_discard_cmd(struct discard_cmd_control *dcc, + struct discard_cmd *dc) +{ + list_move_tail(&dc->list, &dcc->pend_list[plist_idx(dc->len)]); +} + +static void __punch_discard_cmd(struct f2fs_sb_info *sbi, + struct discard_cmd *dc, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_info di = dc->di; + bool modified = false; + + if (dc->state == D_DONE || dc->len == 1) { + __remove_discard_cmd(sbi, dc); + return; + } + + dcc->undiscard_blks -= di.len; + + if (blkaddr > di.lstart) { + dc->len = blkaddr - dc->lstart; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + modified = true; + } + + if (blkaddr < di.lstart + di.len - 1) { + if (modified) { + __insert_discard_tree(sbi, dc->bdev, blkaddr + 1, + di.start + blkaddr + 1 - di.lstart, + di.lstart + di.len - 1 - blkaddr, + NULL, NULL); + } else { + dc->lstart++; + dc->len--; + dc->start++; + dcc->undiscard_blks += dc->len; + __relocate_discard_cmd(dcc, dc); + } + } +} + +static void __update_discard_tree_range(struct f2fs_sb_info *sbi, + struct block_device *bdev, block_t lstart, + block_t start, block_t len) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *prev_dc = NULL, *next_dc = NULL; + struct discard_cmd *dc; + struct discard_info di = {0}; + struct rb_node **insert_p = NULL, *insert_parent = NULL; + block_t end = lstart + len; + + mutex_lock(&dcc->cmd_lock); + + dc = (struct discard_cmd *)__lookup_rb_tree_ret(&dcc->root, + NULL, lstart, + (struct rb_entry **)&prev_dc, + (struct rb_entry **)&next_dc, + &insert_p, &insert_parent, true); + if (dc) + prev_dc = dc; + + if (!prev_dc) { + di.lstart = lstart; + di.len = next_dc ? next_dc->lstart - lstart : len; + di.len = min(di.len, len); + di.start = start; + } + + while (1) { + struct rb_node *node; + bool merged = false; + struct discard_cmd *tdc = NULL; + + if (prev_dc) { + di.lstart = prev_dc->lstart + prev_dc->len; + if (di.lstart < lstart) + di.lstart = lstart; + if (di.lstart >= end) + break; + + if (!next_dc || next_dc->lstart > end) + di.len = end - di.lstart; + else + di.len = next_dc->lstart - di.lstart; + di.start = start + di.lstart - lstart; + } + + if (!di.len) + goto next; + + if (prev_dc && prev_dc->state == D_PREP && + prev_dc->bdev == bdev && + __is_discard_back_mergeable(&di, &prev_dc->di)) { + prev_dc->di.len += di.len; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, prev_dc); + di = prev_dc->di; + tdc = prev_dc; + merged = true; + } + + if (next_dc && next_dc->state == D_PREP && + next_dc->bdev == bdev && + __is_discard_front_mergeable(&di, &next_dc->di)) { + next_dc->di.lstart = di.lstart; + next_dc->di.len += di.len; + next_dc->di.start = di.start; + dcc->undiscard_blks += di.len; + __relocate_discard_cmd(dcc, next_dc); + if (tdc) + __remove_discard_cmd(sbi, tdc); + merged = true; + } + + if (!merged) { + __insert_discard_tree(sbi, bdev, di.lstart, di.start, + di.len, NULL, NULL); + } + next: + prev_dc = next_dc; + if (!prev_dc) + break; + + node = rb_next(&prev_dc->rb_node); + next_dc = rb_entry_safe(node, struct discard_cmd, rb_node); + } + + mutex_unlock(&dcc->cmd_lock); +} + +static int __queue_discard_cmd(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - struct bio *bio = NULL; block_t lblkstart = blkstart; - int err; - trace_f2fs_issue_discard(sbi->sb, blkstart, blklen); + trace_f2fs_queue_discard(bdev, blkstart, blklen); if (sbi->s_ndevs) { int devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - err = __blkdev_issue_discard(bdev, - SECTOR_FROM_BLOCK(blkstart), - SECTOR_FROM_BLOCK(blklen), - GFP_NOFS, 0, &bio); - if (!err && bio) { - struct discard_cmd *dc = __add_discard_cmd(sbi, bio, - lblkstart, blklen); + __update_discard_tree_range(sbi, bdev, lblkstart, blkstart, blklen); + return 0; +} - bio->bi_private = dc; - bio->bi_end_io = f2fs_submit_discard_endio; - submit_bio(REQ_SYNC, bio); +static int __issue_discard_cmd(struct f2fs_sb_info *sbi, bool issue_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + struct blk_plug plug; + int iter = 0, issued = 0; + int i; + bool io_interrupted = false; + + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, + !__check_rb_tree_consistence(sbi, &dcc->root)); + blk_start_plug(&plug); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + + /* Hurry up to finish fstrim */ + if (dcc->pend_list_tag[i] & P_TRIM) { + __submit_discard_cmd(sbi, dc); + issued++; + + if (fatal_signal_pending(current)) + break; + continue; + } + + if (!issue_cond) { + __submit_discard_cmd(sbi, dc); + issued++; + continue; + } + + if (is_idle(sbi)) { + __submit_discard_cmd(sbi, dc); + issued++; + } else { + io_interrupted = true; + } + + if (++iter >= DISCARD_ISSUE_RATE) + goto out; + } + if (list_empty(pend_list) && dcc->pend_list_tag[i] & P_TRIM) + dcc->pend_list_tag[i] &= (~P_TRIM); } - return err; +out: + blk_finish_plug(&plug); + mutex_unlock(&dcc->cmd_lock); + + if (!issued && io_interrupted) + issued = -1; + + return issued; +} + +static void __drop_discard_cmd(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *pend_list; + struct discard_cmd *dc, *tmp; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; i >= 0; i--) { + pend_list = &dcc->pend_list[i]; + list_for_each_entry_safe(dc, tmp, pend_list, list) { + f2fs_bug_on(sbi, dc->state != D_PREP); + __remove_discard_cmd(sbi, dc); + } + } + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_one_discard_bio(struct f2fs_sb_info *sbi, + struct discard_cmd *dc) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + wait_for_completion_io(&dc->wait); + mutex_lock(&dcc->cmd_lock); + f2fs_bug_on(sbi, dc->state != D_DONE); + dc->ref--; + if (!dc->ref) + __remove_discard_cmd(sbi, dc); + mutex_unlock(&dcc->cmd_lock); +} + +static void __wait_discard_cmd(struct f2fs_sb_info *sbi, bool wait_cond) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *wait_list = &(dcc->wait_list); + struct discard_cmd *dc, *tmp; + bool need_wait; + +next: + need_wait = false; + + mutex_lock(&dcc->cmd_lock); + list_for_each_entry_safe(dc, tmp, wait_list, list) { + if (!wait_cond || (dc->state == D_DONE && !dc->ref)) { + wait_for_completion_io(&dc->wait); + __remove_discard_cmd(sbi, dc); + } else { + dc->ref++; + need_wait = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) { + __wait_one_discard_bio(sbi, dc); + goto next; + } +} + +/* This should be covered by global mutex, &sit_i->sentry_lock */ +void f2fs_wait_discard_bio(struct f2fs_sb_info *sbi, block_t blkaddr) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct discard_cmd *dc; + bool need_wait = false; + + mutex_lock(&dcc->cmd_lock); + dc = (struct discard_cmd *)__lookup_rb_tree(&dcc->root, NULL, blkaddr); + if (dc) { + if (dc->state == D_PREP) { + __punch_discard_cmd(sbi, dc, blkaddr); + } else { + dc->ref++; + need_wait = true; + } + } + mutex_unlock(&dcc->cmd_lock); + + if (need_wait) + __wait_one_discard_bio(sbi, dc); +} + +void stop_discard_thread(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (dcc && dcc->f2fs_issue_discard) { + struct task_struct *discard_thread = dcc->f2fs_issue_discard; + + dcc->f2fs_issue_discard = NULL; + kthread_stop(discard_thread); + } +} + +/* This comes from f2fs_put_super and f2fs_trim_fs */ +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +{ + __issue_discard_cmd(sbi, false); + __drop_discard_cmd(sbi); + __wait_discard_cmd(sbi, false); +} + +static void mark_discard_range_all(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) + dcc->pend_list_tag[i] |= P_TRIM; + mutex_unlock(&dcc->cmd_lock); +} + +static int issue_discard_thread(void *data) +{ + struct f2fs_sb_info *sbi = data; + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + wait_queue_head_t *q = &dcc->discard_wait_queue; + unsigned int wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + int issued; + + set_freezable(); + + do { + wait_event_interruptible_timeout(*q, + kthread_should_stop() || freezing(current) || + dcc->discard_wake, + msecs_to_jiffies(wait_ms)); + if (try_to_freeze()) + continue; + if (kthread_should_stop()) + return 0; + + if (dcc->discard_wake) { + dcc->discard_wake = 0; + if (sbi->gc_thread && sbi->gc_thread->gc_urgent) + mark_discard_range_all(sbi); + } + + sb_start_intwrite(sbi->sb); + + issued = __issue_discard_cmd(sbi, true); + if (issued) { + __wait_discard_cmd(sbi, true); + wait_ms = DEF_MIN_DISCARD_ISSUE_TIME; + } else { + wait_ms = DEF_MAX_DISCARD_ISSUE_TIME; + } + + sb_end_intwrite(sbi->sb); + + } while (!kthread_should_stop()); + return 0; } #ifdef CONFIG_BLK_DEV_ZONED static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, struct block_device *bdev, block_t blkstart, block_t blklen) { - sector_t nr_sects = SECTOR_FROM_BLOCK(blklen); - sector_t sector; + sector_t sector, nr_sects; + block_t lblkstart = blkstart; int devi = 0; if (sbi->s_ndevs) { devi = f2fs_target_device_index(sbi, blkstart); blkstart -= FDEV(devi).start_blk; } - sector = SECTOR_FROM_BLOCK(blkstart); - - if (sector & (bdev_zone_size(bdev) - 1) || - nr_sects != bdev_zone_size(bdev)) { - f2fs_msg(sbi->sb, KERN_INFO, - "(%d) %s: Unaligned discard attempted (block %x + %x)", - devi, sbi->s_ndevs ? FDEV(devi).path: "", - blkstart, blklen); - return -EIO; - } /* * We need to know the type of the zone: for conventional zones, @@ -829,10 +1373,21 @@ static int __f2fs_issue_discard_zone(struct f2fs_sb_info *sbi, case BLK_ZONE_TYPE_CONVENTIONAL: if (!blk_queue_discard(bdev_get_queue(bdev))) return 0; - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, lblkstart, blklen); case BLK_ZONE_TYPE_SEQWRITE_REQ: case BLK_ZONE_TYPE_SEQWRITE_PREF: - trace_f2fs_issue_reset_zone(sbi->sb, blkstart); + sector = SECTOR_FROM_BLOCK(blkstart); + nr_sects = SECTOR_FROM_BLOCK(blklen); + + if (sector & (bdev_zone_sectors(bdev) - 1) || + nr_sects != bdev_zone_sectors(bdev)) { + f2fs_msg(sbi->sb, KERN_INFO, + "(%d) %s: Unaligned discard attempted (block %x + %x)", + devi, sbi->s_ndevs ? FDEV(devi).path: "", + blkstart, blklen); + return -EIO; + } + trace_f2fs_issue_reset_zone(bdev, blkstart); return blkdev_reset_zones(bdev, sector, nr_sects, GFP_NOFS); default: @@ -850,7 +1405,7 @@ static int __issue_discard_async(struct f2fs_sb_info *sbi, bdev_zoned_model(bdev) != BLK_ZONED_NONE) return __f2fs_issue_discard_zone(sbi, bdev, blkstart, blklen); #endif - return __f2fs_issue_discard_async(sbi, bdev, blkstart, blklen); + return __queue_discard_cmd(sbi, bdev, blkstart, blklen); } static int f2fs_issue_discard(struct f2fs_sb_info *sbi, @@ -893,31 +1448,6 @@ static int f2fs_issue_discard(struct f2fs_sb_info *sbi, return err; } -static void __add_discard_entry(struct f2fs_sb_info *sbi, - struct cp_control *cpc, struct seg_entry *se, - unsigned int start, unsigned int end) -{ - struct list_head *head = &SM_I(sbi)->discard_entry_list; - struct discard_entry *new, *last; - - if (!list_empty(head)) { - last = list_last_entry(head, struct discard_entry, list); - if (START_BLOCK(sbi, cpc->trim_start) + start == - last->blkaddr + last->len) { - last->len += end - start; - goto done; - } - } - - new = f2fs_kmem_cache_alloc(discard_entry_slab, GFP_NOFS); - INIT_LIST_HEAD(&new->list); - new->blkaddr = START_BLOCK(sbi, cpc->trim_start) + start; - new->len = end - start; - list_add_tail(&new->list, head); -done: - SM_I(sbi)->nr_discards += end - start; -} - static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, bool check_only) { @@ -929,7 +1459,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, unsigned long *discard_map = (unsigned long *)se->discard_map; unsigned long *dmap = SIT_I(sbi)->tmp_map; unsigned int start = 0, end = -1; - bool force = (cpc->reason == CP_DISCARD); + bool force = (cpc->reason & CP_DISCARD); + struct discard_entry *de = NULL; + struct list_head *head = &SM_I(sbi)->dcc_info->entry_list; int i; if (se->valid_blocks == max_blocks || !f2fs_discard_en(sbi)) @@ -937,7 +1469,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (!force) { if (!test_opt(sbi, DISCARD) || !se->valid_blocks || - SM_I(sbi)->nr_discards >= SM_I(sbi)->max_discards) + SM_I(sbi)->dcc_info->nr_discards >= + SM_I(sbi)->dcc_info->max_discards) return false; } @@ -946,7 +1479,8 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, dmap[i] = force ? ~ckpt_map[i] & ~discard_map[i] : (cur_map[i] ^ ckpt_map[i]) & ckpt_map[i]; - while (force || SM_I(sbi)->nr_discards <= SM_I(sbi)->max_discards) { + while (force || SM_I(sbi)->dcc_info->nr_discards <= + SM_I(sbi)->dcc_info->max_discards) { start = __find_rev_next_bit(dmap, max_blocks, end + 1); if (start >= max_blocks) break; @@ -959,14 +1493,24 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, if (check_only) return true; - __add_discard_entry(sbi, cpc, se, start, end); + if (!de) { + de = f2fs_kmem_cache_alloc(discard_entry_slab, + GFP_F2FS_ZERO); + de->start_blkaddr = START_BLOCK(sbi, cpc->trim_start); + list_add_tail(&de->list, head); + } + + for (i = start; i < end; i++) + __set_bit_le(i, (void *)de->discard_map); + + SM_I(sbi)->dcc_info->nr_discards += end - start; } return false; } void release_discard_addrs(struct f2fs_sb_info *sbi) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct list_head *head = &(SM_I(sbi)->dcc_info->entry_list); struct discard_entry *entry, *this; /* drop caches */ @@ -992,16 +1536,14 @@ static void set_prefree_as_free_segments(struct f2fs_sb_info *sbi) void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) { - struct list_head *head = &(SM_I(sbi)->discard_entry_list); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + struct list_head *head = &dcc->entry_list; struct discard_entry *entry, *this; struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); - struct blk_plug plug; unsigned long *prefree_map = dirty_i->dirty_segmap[PRE]; unsigned int start = 0, end = -1; unsigned int secno, start_segno; - bool force = (cpc->reason == CP_DISCARD); - - blk_start_plug(&plug); + bool force = (cpc->reason & CP_DISCARD); mutex_lock(&dirty_i->seglist_lock); @@ -1031,32 +1573,117 @@ void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc) continue; } next: - secno = GET_SECNO(sbi, start); - start_segno = secno * sbi->segs_per_sec; + secno = GET_SEC_FROM_SEG(sbi, start); + start_segno = GET_SEG_FROM_SEC(sbi, secno); if (!IS_CURSEC(sbi, secno) && - !get_valid_blocks(sbi, start, sbi->segs_per_sec)) + !get_valid_blocks(sbi, start, true)) f2fs_issue_discard(sbi, START_BLOCK(sbi, start_segno), sbi->segs_per_sec << sbi->log_blocks_per_seg); start = start_segno + sbi->segs_per_sec; if (start < end) goto next; + else + end = start - 1; } mutex_unlock(&dirty_i->seglist_lock); /* send small discards */ list_for_each_entry_safe(entry, this, head, list) { - if (force && entry->len < cpc->trim_minlen) - goto skip; - f2fs_issue_discard(sbi, entry->blkaddr, entry->len); - cpc->trimmed += entry->len; + unsigned int cur_pos = 0, next_pos, len, total_len = 0; + bool is_valid = test_bit_le(0, entry->discard_map); + +find_next: + if (is_valid) { + next_pos = find_next_zero_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + len = next_pos - cur_pos; + + if (f2fs_sb_mounted_blkzoned(sbi->sb) || + (force && len < cpc->trim_minlen)) + goto skip; + + f2fs_issue_discard(sbi, entry->start_blkaddr + cur_pos, + len); + cpc->trimmed += len; + total_len += len; + } else { + next_pos = find_next_bit_le(entry->discard_map, + sbi->blocks_per_seg, cur_pos); + } skip: + cur_pos = next_pos; + is_valid = !is_valid; + + if (cur_pos < sbi->blocks_per_seg) + goto find_next; + list_del(&entry->list); - SM_I(sbi)->nr_discards -= entry->len; + dcc->nr_discards -= total_len; kmem_cache_free(discard_entry_slab, entry); } - blk_finish_plug(&plug); + wake_up_discard_thread(sbi, false); +} + +static int create_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + dev_t dev = sbi->sb->s_bdev->bd_dev; + struct discard_cmd_control *dcc; + int err = 0, i; + + if (SM_I(sbi)->dcc_info) { + dcc = SM_I(sbi)->dcc_info; + goto init_thread; + } + + dcc = kzalloc(sizeof(struct discard_cmd_control), GFP_KERNEL); + if (!dcc) + return -ENOMEM; + + dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; + INIT_LIST_HEAD(&dcc->entry_list); + for (i = 0; i < MAX_PLIST_NUM; i++) { + INIT_LIST_HEAD(&dcc->pend_list[i]); + if (i >= dcc->discard_granularity - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + } + INIT_LIST_HEAD(&dcc->wait_list); + mutex_init(&dcc->cmd_lock); + atomic_set(&dcc->issued_discard, 0); + atomic_set(&dcc->issing_discard, 0); + atomic_set(&dcc->discard_cmd_cnt, 0); + dcc->nr_discards = 0; + dcc->max_discards = MAIN_SEGS(sbi) << sbi->log_blocks_per_seg; + dcc->undiscard_blks = 0; + dcc->root = RB_ROOT; + + init_waitqueue_head(&dcc->discard_wait_queue); + SM_I(sbi)->dcc_info = dcc; +init_thread: + dcc->f2fs_issue_discard = kthread_run(issue_discard_thread, sbi, + "f2fs_discard-%u:%u", MAJOR(dev), MINOR(dev)); + if (IS_ERR(dcc->f2fs_issue_discard)) { + err = PTR_ERR(dcc->f2fs_issue_discard); + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; + return err; + } + + return err; +} + +static void destroy_discard_cmd_control(struct f2fs_sb_info *sbi) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + + if (!dcc) + return; + + stop_discard_thread(sbi); + + kfree(dcc); + SM_I(sbi)->dcc_info = NULL; } static bool __mark_sit_entry_dirty(struct f2fs_sb_info *sbi, unsigned int segno) @@ -1085,6 +1712,10 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) struct seg_entry *se; unsigned int segno, offset; long int new_vblocks; + bool exist; +#ifdef CONFIG_F2FS_CHECK_FS + bool mir_exist; +#endif segno = GET_SEGNO(sbi, blkaddr); @@ -1101,32 +1732,54 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - if (f2fs_test_and_set_bit(offset, se->cur_valid_map)) { + exist = f2fs_test_and_set_bit(offset, se->cur_valid_map); #ifdef CONFIG_F2FS_CHECK_FS - if (f2fs_test_and_set_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else + mir_exist = f2fs_test_and_set_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when setting bitmap, blk:%u, old bit:%d", + blkaddr, exist); f2fs_bug_on(sbi, 1); -#endif } +#endif + if (unlikely(exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly set, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks--; + del = 0; + } + if (f2fs_discard_en(sbi) && !f2fs_test_and_set_bit(offset, se->discard_map)) sbi->discard_blks--; - } else { - if (!f2fs_test_and_clear_bit(offset, se->cur_valid_map)) { -#ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_test_and_clear_bit(offset, - se->cur_valid_map_mir)) - f2fs_bug_on(sbi, 1); - else - WARN_ON(1); -#else - f2fs_bug_on(sbi, 1); -#endif + + /* don't overwrite by SSR to keep node chain */ + if (se->type == CURSEG_WARM_NODE) { + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + se->ckpt_valid_blocks++; } + } else { + exist = f2fs_test_and_clear_bit(offset, se->cur_valid_map); +#ifdef CONFIG_F2FS_CHECK_FS + mir_exist = f2fs_test_and_clear_bit(offset, + se->cur_valid_map_mir); + if (unlikely(exist != mir_exist)) { + f2fs_msg(sbi->sb, KERN_ERR, "Inconsistent error " + "when clearing bitmap, blk:%u, old bit:%d", + blkaddr, exist); + f2fs_bug_on(sbi, 1); + } +#endif + if (unlikely(!exist)) { + f2fs_msg(sbi->sb, KERN_ERR, + "Bitmap was wrongly cleared, blk:%u", blkaddr); + f2fs_bug_on(sbi, 1); + se->valid_blocks++; + del = 0; + } + if (f2fs_discard_en(sbi) && f2fs_test_and_clear_bit(offset, se->discard_map)) sbi->discard_blks++; @@ -1312,8 +1965,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, struct free_segmap_info *free_i = FREE_I(sbi); unsigned int segno, secno, zoneno; unsigned int total_zones = MAIN_SECS(sbi) / sbi->secs_per_zone; - unsigned int hint = *newseg / sbi->segs_per_sec; - unsigned int old_zoneno = GET_ZONENO_FROM_SEGNO(sbi, *newseg); + unsigned int hint = GET_SEC_FROM_SEG(sbi, *newseg); + unsigned int old_zoneno = GET_ZONE_FROM_SEG(sbi, *newseg); unsigned int left_start = hint; bool init = true; int go_left = 0; @@ -1323,8 +1976,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, if (!new_sec && ((*newseg + 1) % sbi->segs_per_sec)) { segno = find_next_zero_bit(free_i->free_segmap, - (hint + 1) * sbi->segs_per_sec, *newseg + 1); - if (segno < (hint + 1) * sbi->segs_per_sec) + GET_SEG_FROM_SEC(sbi, hint + 1), *newseg + 1); + if (segno < GET_SEG_FROM_SEC(sbi, hint + 1)) goto got_it; } find_other_zone: @@ -1355,8 +2008,8 @@ static void get_new_segment(struct f2fs_sb_info *sbi, secno = left_start; skip_left: hint = secno; - segno = secno * sbi->segs_per_sec; - zoneno = secno / sbi->secs_per_zone; + segno = GET_SEG_FROM_SEC(sbi, secno); + zoneno = GET_ZONE_FROM_SEC(sbi, secno); /* give up on finding another zone */ if (!init) @@ -1400,7 +2053,7 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) struct summary_footer *sum_footer; curseg->segno = curseg->next_segno; - curseg->zone = GET_ZONENO_FROM_SEGNO(sbi, curseg->segno); + curseg->zone = GET_ZONE_FROM_SEG(sbi, curseg->segno); curseg->next_blkoff = 0; curseg->next_segno = NULL_SEGNO; @@ -1413,6 +2066,20 @@ static void reset_curseg(struct f2fs_sb_info *sbi, int type, int modified) __set_sit_entry_type(sbi, type, curseg->segno, modified); } +static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) +{ + /* if segs_per_sec is large than 1, we need to keep original policy. */ + if (sbi->segs_per_sec != 1) + return CURSEG_I(sbi, type)->segno; + + if (type == CURSEG_HOT_DATA || IS_NODESEG(type)) + return 0; + + if (SIT_I(sbi)->last_victim[ALLOC_NEXT]) + return SIT_I(sbi)->last_victim[ALLOC_NEXT]; + return CURSEG_I(sbi, type)->segno; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -1431,6 +2098,7 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) if (test_opt(sbi, NOHEAP)) dir = ALLOC_RIGHT; + segno = __get_next_segno(sbi, type); get_new_segment(sbi, &segno, new_sec, dir); curseg->next_segno = segno; reset_curseg(sbi, type, 1); @@ -1473,7 +2141,7 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks */ -static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) +static void change_curseg(struct f2fs_sb_info *sbi, int type) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1494,28 +2162,53 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool reuse) curseg->alloc_type = SSR; __next_free_blkoff(sbi, curseg, 0); - if (reuse) { - sum_page = get_sum_page(sbi, new_segno); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); - memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); - } + sum_page = get_sum_page(sbi, new_segno); + sum_node = (struct f2fs_summary_block *)page_address(sum_page); + memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); + f2fs_put_page(sum_page, 1); } static int get_ssr_segment(struct f2fs_sb_info *sbi, int type) { struct curseg_info *curseg = CURSEG_I(sbi, type); const struct victim_selection *v_ops = DIRTY_I(sbi)->v_ops; + unsigned segno = NULL_SEGNO; + int i, cnt; + bool reversed = false; - if (IS_NODESEG(type) || !has_not_enough_free_secs(sbi, 0, 0)) - return v_ops->get_victim(sbi, - &(curseg)->next_segno, BG_GC, type, SSR); + /* need_SSR() already forces to do this */ + if (v_ops->get_victim(sbi, &segno, BG_GC, type, SSR)) { + curseg->next_segno = segno; + return 1; + } - /* For data segments, let's do SSR more intensively */ - for (; type >= CURSEG_HOT_DATA; type--) - if (v_ops->get_victim(sbi, &(curseg)->next_segno, - BG_GC, type, SSR)) + /* For node segments, let's do SSR more intensively */ + if (IS_NODESEG(type)) { + if (type >= CURSEG_WARM_NODE) { + reversed = true; + i = CURSEG_COLD_NODE; + } else { + i = CURSEG_HOT_NODE; + } + cnt = NR_CURSEG_NODE_TYPE; + } else { + if (type >= CURSEG_WARM_DATA) { + reversed = true; + i = CURSEG_COLD_DATA; + } else { + i = CURSEG_HOT_DATA; + } + cnt = NR_CURSEG_DATA_TYPE; + } + + for (; cnt-- > 0; reversed ? i-- : i++) { + if (i == type) + continue; + if (v_ops->get_victim(sbi, &segno, BG_GC, i, SSR)) { + curseg->next_segno = segno; return 1; + } + } return 0; } @@ -1530,12 +2223,13 @@ static void allocate_segment_by_default(struct f2fs_sb_info *sbi, if (force) new_curseg(sbi, type, true); - else if (type == CURSEG_WARM_NODE) + else if (!is_set_ckpt_flags(sbi, CP_CRC_RECOVERY_FLAG) && + type == CURSEG_WARM_NODE) new_curseg(sbi, type, false); else if (curseg->alloc_type == LFS && is_next_segment_free(sbi, type)) new_curseg(sbi, type, false); else if (need_SSR(sbi) && get_ssr_segment(sbi, type)) - change_curseg(sbi, type, true); + change_curseg(sbi, type); else new_curseg(sbi, type, false); @@ -1563,14 +2257,19 @@ static const struct segment_allocation default_salloc_ops = { bool exist_trim_candidates(struct f2fs_sb_info *sbi, struct cp_control *cpc) { __u64 trim_start = cpc->trim_start; + bool has_candidate = false; mutex_lock(&SIT_I(sbi)->sentry_lock); - for (; trim_start <= cpc->trim_end; trim_start++) - if (add_discard_addrs(sbi, cpc, true)) + for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) { + if (add_discard_addrs(sbi, cpc, true)) { + has_candidate = true; break; + } + } mutex_unlock(&SIT_I(sbi)->sentry_lock); - return trim_start <= cpc->trim_end; + cpc->trim_start = trim_start; + return has_candidate; } int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) @@ -1623,6 +2322,9 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) schedule(); } + /* It's time to issue all the filed discards */ + mark_discard_range_all(sbi); + f2fs_wait_discard_bios(sbi); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; @@ -1636,68 +2338,80 @@ static bool __has_curseg_space(struct f2fs_sb_info *sbi, int type) return false; } -static int __get_segment_type_2(struct page *page, enum page_type p_type) +static int __get_segment_type_2(struct f2fs_io_info *fio) { - if (p_type == DATA) + if (fio->type == DATA) return CURSEG_HOT_DATA; else return CURSEG_HOT_NODE; } -static int __get_segment_type_4(struct page *page, enum page_type p_type) +static int __get_segment_type_4(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; else return CURSEG_COLD_DATA; } else { - if (IS_DNODE(page) && is_cold_node(page)) + if (IS_DNODE(fio->page) && is_cold_node(fio->page)) return CURSEG_WARM_NODE; else return CURSEG_COLD_NODE; } } -static int __get_segment_type_6(struct page *page, enum page_type p_type) +static int __get_segment_type_6(struct f2fs_io_info *fio) { - if (p_type == DATA) { - struct inode *inode = page->mapping->host; + if (fio->type == DATA) { + struct inode *inode = fio->page->mapping->host; - if (S_ISDIR(inode->i_mode)) - return CURSEG_HOT_DATA; - else if (is_cold_data(page) || file_is_cold(inode)) + if (is_cold_data(fio->page) || file_is_cold(inode)) return CURSEG_COLD_DATA; - else - return CURSEG_WARM_DATA; + if (is_inode_flag_set(inode, FI_HOT_DATA)) + return CURSEG_HOT_DATA; + return CURSEG_WARM_DATA; } else { - if (IS_DNODE(page)) - return is_cold_node(page) ? CURSEG_WARM_NODE : + if (IS_DNODE(fio->page)) + return is_cold_node(fio->page) ? CURSEG_WARM_NODE : CURSEG_HOT_NODE; - else - return CURSEG_COLD_NODE; + return CURSEG_COLD_NODE; } } -static int __get_segment_type(struct page *page, enum page_type p_type) +static int __get_segment_type(struct f2fs_io_info *fio) { - switch (F2FS_P_SB(page)->active_logs) { + int type = 0; + + switch (fio->sbi->active_logs) { case 2: - return __get_segment_type_2(page, p_type); + type = __get_segment_type_2(fio); + break; case 4: - return __get_segment_type_4(page, p_type); + type = __get_segment_type_4(fio); + break; + case 6: + type = __get_segment_type_6(fio); + break; + default: + f2fs_bug_on(fio->sbi, true); } - /* NR_CURSEG_TYPE(6) logs by default */ - f2fs_bug_on(F2FS_P_SB(page), - F2FS_P_SB(page)->active_logs != NR_CURSEG_TYPE); - return __get_segment_type_6(page, p_type); + + if (IS_HOT(type)) + fio->temp = HOT; + else if (IS_WARM(type)) + fio->temp = WARM; + else + fio->temp = COLD; + return type; } void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, - struct f2fs_summary *sum, int type) + struct f2fs_summary *sum, int type, + struct f2fs_io_info *fio, bool add_list) { struct sit_info *sit_i = SIT_I(sbi); struct curseg_info *curseg = CURSEG_I(sbi, type); @@ -1723,42 +2437,52 @@ void allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, if (!__has_curseg_space(sbi, type)) sit_i->s_ops->allocate_segment(sbi, type, false); /* - * SIT information should be updated before segment allocation, - * since SSR needs latest valid block information. + * SIT information should be updated after segment allocation, + * since we need to keep dirty segments precisely under SSR. */ refresh_sit_entry(sbi, old_blkaddr, *new_blkaddr); mutex_unlock(&sit_i->sentry_lock); - if (page && IS_NODESEG(type)) + if (page && IS_NODESEG(type)) { fill_node_footer_blkaddr(page, NEXT_FREE_BLKADDR(sbi, curseg)); + f2fs_inode_chksum_set(sbi, page); + } + + if (add_list) { + struct f2fs_bio_info *io; + + INIT_LIST_HEAD(&fio->list); + fio->in_list = true; + io = sbi->write_io[fio->type] + fio->temp; + spin_lock(&io->io_lock); + list_add_tail(&fio->list, &io->io_list); + spin_unlock(&io->io_lock); + } + mutex_unlock(&curseg->curseg_mutex); } static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { - int type = __get_segment_type(fio->page, fio->type); + int type = __get_segment_type(fio); int err; - if (fio->type == NODE || fio->type == DATA) - mutex_lock(&fio->sbi->wio_mutex[fio->type]); reallocate: allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, - &fio->new_blkaddr, sum, type); + &fio->new_blkaddr, sum, type, fio, true); /* writeout dirty page into bdev */ - err = f2fs_submit_page_mbio(fio); + err = f2fs_submit_page_write(fio); if (err == -EAGAIN) { fio->old_blkaddr = fio->new_blkaddr; goto reallocate; } - - if (fio->type == NODE || fio->type == DATA) - mutex_unlock(&fio->sbi->wio_mutex[fio->type]); } -void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) +void write_meta_page(struct f2fs_sb_info *sbi, struct page *page, + enum iostat_type io_type) { struct f2fs_io_info fio = { .sbi = sbi, @@ -1769,13 +2493,16 @@ void write_meta_page(struct f2fs_sb_info *sbi, struct page *page) .new_blkaddr = page->index, .page = page, .encrypted_page = NULL, + .in_list = false, }; if (unlikely(page->index >= MAIN_BLKADDR(sbi))) fio.op_flags &= ~REQ_META; set_page_writeback(page); - f2fs_submit_page_mbio(&fio); + f2fs_submit_page_write(&fio); + + f2fs_update_iostat(sbi, io_type, F2FS_BLKSIZE); } void write_node_page(unsigned int nid, struct f2fs_io_info *fio) @@ -1784,6 +2511,8 @@ void write_node_page(unsigned int nid, struct f2fs_io_info *fio) set_summary(&sum, nid, 0, 0); do_write_page(&sum, fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); } void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) @@ -1797,13 +2526,22 @@ void write_data_page(struct dnode_of_data *dn, struct f2fs_io_info *fio) set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version); do_write_page(&sum, fio); f2fs_update_data_blkaddr(dn, fio->new_blkaddr); + + f2fs_update_iostat(sbi, fio->io_type, F2FS_BLKSIZE); } -void rewrite_data_page(struct f2fs_io_info *fio) +int rewrite_data_page(struct f2fs_io_info *fio) { + int err; + fio->new_blkaddr = fio->old_blkaddr; stat_inc_inplace_blocks(fio->sbi); - f2fs_submit_page_mbio(fio); + + err = f2fs_submit_page_bio(fio); + + f2fs_update_iostat(fio->sbi, fio->io_type, F2FS_BLKSIZE); + + return err; } void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, @@ -1845,7 +2583,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, /* change the current segment */ if (segno != curseg->segno) { curseg->next_segno = segno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = GET_BLKOFF_FROM_SEG0(sbi, new_blkaddr); @@ -1864,7 +2602,7 @@ void __f2fs_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (recover_curseg) { if (old_cursegno != curseg->segno) { curseg->next_segno = old_cursegno; - change_curseg(sbi, type, true); + change_curseg(sbi, type); } curseg->next_blkoff = old_blkoff; } @@ -1894,7 +2632,8 @@ void f2fs_wait_on_page_writeback(struct page *page, if (PageWriteback(page)) { struct f2fs_sb_info *sbi = F2FS_P_SB(page); - f2fs_submit_merged_bio_cond(sbi, NULL, page, 0, type, WRITE); + f2fs_submit_merged_write_cond(sbi, page->mapping->host, + 0, page->index, type); if (ordered) wait_on_page_writeback(page); else @@ -1902,8 +2641,7 @@ void f2fs_wait_on_page_writeback(struct page *page, } } -void f2fs_wait_on_encrypted_page_writeback(struct f2fs_sb_info *sbi, - block_t blkaddr) +void f2fs_wait_on_block_writeback(struct f2fs_sb_info *sbi, block_t blkaddr) { struct page *cpage; @@ -2052,6 +2790,8 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) static int restore_curseg_summaries(struct f2fs_sb_info *sbi) { + struct f2fs_journal *sit_j = CURSEG_I(sbi, CURSEG_COLD_DATA)->journal; + struct f2fs_journal *nat_j = CURSEG_I(sbi, CURSEG_HOT_DATA)->journal; int type = CURSEG_HOT_DATA; int err; @@ -2078,6 +2818,11 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) return err; } + /* sanity check for summary blocks */ + if (nats_in_cursum(nat_j) > NAT_JOURNAL_ENTRIES || + sits_in_cursum(sit_j) > SIT_JOURNAL_ENTRIES) + return -EINVAL; + return 0; } @@ -2367,7 +3112,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) se = get_seg_entry(sbi, segno); /* add discard candidates */ - if (cpc->reason != CP_DISCARD) { + if (!(cpc->reason & CP_DISCARD)) { cpc->trim_start = segno; add_discard_addrs(sbi, cpc, false); } @@ -2403,7 +3148,7 @@ void flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) f2fs_bug_on(sbi, !list_empty(head)); f2fs_bug_on(sbi, sit_i->dirty_sentries); out: - if (cpc->reason == CP_DISCARD) { + if (cpc->reason & CP_DISCARD) { __u64 trim_start = cpc->trim_start; for (; cpc->trim_start <= cpc->trim_end; cpc->trim_start++) @@ -2431,13 +3176,13 @@ static int build_sit_info(struct f2fs_sb_info *sbi) SM_I(sbi)->sit_info = sit_i; - sit_i->sentries = f2fs_kvzalloc(MAIN_SEGS(sbi) * + sit_i->sentries = kvzalloc(MAIN_SEGS(sbi) * sizeof(struct seg_entry), GFP_KERNEL); if (!sit_i->sentries) return -ENOMEM; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - sit_i->dirty_sentries_bitmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + sit_i->dirty_sentries_bitmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!sit_i->dirty_sentries_bitmap) return -ENOMEM; @@ -2470,7 +3215,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi) return -ENOMEM; if (sbi->segs_per_sec > 1) { - sit_i->sec_entries = f2fs_kvzalloc(MAIN_SECS(sbi) * + sit_i->sec_entries = kvzalloc(MAIN_SECS(sbi) * sizeof(struct sec_entry), GFP_KERNEL); if (!sit_i->sec_entries) return -ENOMEM; @@ -2521,12 +3266,12 @@ static int build_free_segmap(struct f2fs_sb_info *sbi) SM_I(sbi)->free_info = free_i; bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); - free_i->free_segmap = f2fs_kvmalloc(bitmap_size, GFP_KERNEL); + free_i->free_segmap = kvmalloc(bitmap_size, GFP_KERNEL); if (!free_i->free_segmap) return -ENOMEM; sec_bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - free_i->free_secmap = f2fs_kvmalloc(sec_bitmap_size, GFP_KERNEL); + free_i->free_secmap = kvmalloc(sec_bitmap_size, GFP_KERNEL); if (!free_i->free_secmap) return -ENOMEM; @@ -2602,10 +3347,17 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) /* build discard map only one time */ if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += sbi->blocks_per_seg - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, + se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += + sbi->blocks_per_seg - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2629,10 +3381,15 @@ static void build_sit_entries(struct f2fs_sb_info *sbi) seg_info_from_raw_sit(se, &sit); if (f2fs_discard_en(sbi)) { - memcpy(se->discard_map, se->cur_valid_map, - SIT_VBLOCK_MAP_SIZE); - sbi->discard_blks += old_valid_blocks - - se->valid_blocks; + if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) { + memset(se->discard_map, 0xff, + SIT_VBLOCK_MAP_SIZE); + } else { + memcpy(se->discard_map, se->cur_valid_map, + SIT_VBLOCK_MAP_SIZE); + sbi->discard_blks += old_valid_blocks - + se->valid_blocks; + } } if (sbi->segs_per_sec > 1) @@ -2676,7 +3433,7 @@ static void init_dirty_segmap(struct f2fs_sb_info *sbi) if (segno >= MAIN_SEGS(sbi)) break; offset = segno + 1; - valid_blocks = get_valid_blocks(sbi, segno, 0); + valid_blocks = get_valid_blocks(sbi, segno, false); if (valid_blocks == sbi->blocks_per_seg || !valid_blocks) continue; if (valid_blocks > sbi->blocks_per_seg) { @@ -2694,7 +3451,7 @@ static int init_victim_secmap(struct f2fs_sb_info *sbi) struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); unsigned int bitmap_size = f2fs_bitmap_size(MAIN_SECS(sbi)); - dirty_i->victim_secmap = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->victim_secmap = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->victim_secmap) return -ENOMEM; return 0; @@ -2716,7 +3473,7 @@ static int build_dirty_segmap(struct f2fs_sb_info *sbi) bitmap_size = f2fs_bitmap_size(MAIN_SEGS(sbi)); for (i = 0; i < NR_DIRTY_TYPE; i++) { - dirty_i->dirty_segmap[i] = f2fs_kvzalloc(bitmap_size, GFP_KERNEL); + dirty_i->dirty_segmap[i] = kvzalloc(bitmap_size, GFP_KERNEL); if (!dirty_i->dirty_segmap[i]) return -ENOMEM; } @@ -2782,22 +3539,22 @@ int build_segment_manager(struct f2fs_sb_info *sbi) sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC; sm_info->min_ipu_util = DEF_MIN_IPU_UTIL; sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS; - - INIT_LIST_HEAD(&sm_info->discard_entry_list); - INIT_LIST_HEAD(&sm_info->discard_cmd_list); - sm_info->nr_discards = 0; - sm_info->max_discards = 0; + sm_info->min_hot_blocks = DEF_MIN_HOT_BLOCKS; sm_info->trim_sections = DEF_BATCHED_TRIM_SECTIONS; INIT_LIST_HEAD(&sm_info->sit_entry_set); - if (test_opt(sbi, FLUSH_MERGE) && !f2fs_readonly(sbi->sb)) { + if (!f2fs_readonly(sbi->sb)) { err = create_flush_cmd_control(sbi); if (err) return err; } + err = create_discard_cmd_control(sbi); + if (err) + return err; + err = build_sit_info(sbi); if (err) return err; @@ -2919,6 +3676,7 @@ void destroy_segment_manager(struct f2fs_sb_info *sbi) if (!sm_info) return; destroy_flush_cmd_control(sbi, true); + destroy_discard_cmd_control(sbi); destroy_dirty_segmap(sbi); destroy_curseg(sbi); destroy_free_segmap(sbi); diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 5cb5755c75d9..ffa11274b0ce 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -21,78 +21,88 @@ #define F2FS_MIN_SEGMENTS 9 /* SB + 2 (CP + SIT + NAT) + SSA + MAIN */ /* L: Logical segment # in volume, R: Relative segment # in main area */ -#define GET_L2R_SEGNO(free_i, segno) (segno - free_i->start_segno) -#define GET_R2L_SEGNO(free_i, segno) (segno + free_i->start_segno) +#define GET_L2R_SEGNO(free_i, segno) ((segno) - (free_i)->start_segno) +#define GET_R2L_SEGNO(free_i, segno) ((segno) + (free_i)->start_segno) -#define IS_DATASEG(t) (t <= CURSEG_COLD_DATA) -#define IS_NODESEG(t) (t >= CURSEG_HOT_NODE) +#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA) +#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE) + +#define IS_HOT(t) ((t) == CURSEG_HOT_NODE || (t) == CURSEG_HOT_DATA) +#define IS_WARM(t) ((t) == CURSEG_WARM_NODE || (t) == CURSEG_WARM_DATA) +#define IS_COLD(t) ((t) == CURSEG_COLD_NODE || (t) == CURSEG_COLD_DATA) #define IS_CURSEG(sbi, seg) \ - ((seg == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ - (seg == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) + (((seg) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno) || \ + ((seg) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno)) #define IS_CURSEC(sbi, secno) \ - ((secno == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ - sbi->segs_per_sec) || \ - (secno == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ - sbi->segs_per_sec)) \ + (((secno) == CURSEG_I(sbi, CURSEG_HOT_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_DATA)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_HOT_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_WARM_NODE)->segno / \ + (sbi)->segs_per_sec) || \ + ((secno) == CURSEG_I(sbi, CURSEG_COLD_NODE)->segno / \ + (sbi)->segs_per_sec)) \ #define MAIN_BLKADDR(sbi) (SM_I(sbi)->main_blkaddr) #define SEG0_BLKADDR(sbi) (SM_I(sbi)->seg0_blkaddr) #define MAIN_SEGS(sbi) (SM_I(sbi)->main_segments) -#define MAIN_SECS(sbi) (sbi->total_sections) +#define MAIN_SECS(sbi) ((sbi)->total_sections) #define TOTAL_SEGS(sbi) (SM_I(sbi)->segment_count) -#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << sbi->log_blocks_per_seg) +#define TOTAL_BLKS(sbi) (TOTAL_SEGS(sbi) << (sbi)->log_blocks_per_seg) #define MAX_BLKADDR(sbi) (SEG0_BLKADDR(sbi) + TOTAL_BLKS(sbi)) -#define SEGMENT_SIZE(sbi) (1ULL << (sbi->log_blocksize + \ - sbi->log_blocks_per_seg)) +#define SEGMENT_SIZE(sbi) (1ULL << ((sbi)->log_blocksize + \ + (sbi)->log_blocks_per_seg)) #define START_BLOCK(sbi, segno) (SEG0_BLKADDR(sbi) + \ - (GET_R2L_SEGNO(FREE_I(sbi), segno) << sbi->log_blocks_per_seg)) + (GET_R2L_SEGNO(FREE_I(sbi), segno) << (sbi)->log_blocks_per_seg)) #define NEXT_FREE_BLKADDR(sbi, curseg) \ - (START_BLOCK(sbi, curseg->segno) + curseg->next_blkoff) + (START_BLOCK(sbi, (curseg)->segno) + (curseg)->next_blkoff) #define GET_SEGOFF_FROM_SEG0(sbi, blk_addr) ((blk_addr) - SEG0_BLKADDR(sbi)) #define GET_SEGNO_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> sbi->log_blocks_per_seg) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) >> (sbi)->log_blocks_per_seg) #define GET_BLKOFF_FROM_SEG0(sbi, blk_addr) \ - (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & (sbi->blocks_per_seg - 1)) + (GET_SEGOFF_FROM_SEG0(sbi, blk_addr) & ((sbi)->blocks_per_seg - 1)) #define GET_SEGNO(sbi, blk_addr) \ - (((blk_addr == NULL_ADDR) || (blk_addr == NEW_ADDR)) ? \ + ((((blk_addr) == NULL_ADDR) || ((blk_addr) == NEW_ADDR)) ? \ NULL_SEGNO : GET_L2R_SEGNO(FREE_I(sbi), \ GET_SEGNO_FROM_SEG0(sbi, blk_addr))) -#define GET_SECNO(sbi, segno) \ - ((segno) / sbi->segs_per_sec) -#define GET_ZONENO_FROM_SEGNO(sbi, segno) \ - ((segno / sbi->segs_per_sec) / sbi->secs_per_zone) +#define BLKS_PER_SEC(sbi) \ + ((sbi)->segs_per_sec * (sbi)->blocks_per_seg) +#define GET_SEC_FROM_SEG(sbi, segno) \ + ((segno) / (sbi)->segs_per_sec) +#define GET_SEG_FROM_SEC(sbi, secno) \ + ((secno) * (sbi)->segs_per_sec) +#define GET_ZONE_FROM_SEC(sbi, secno) \ + ((secno) / (sbi)->secs_per_zone) +#define GET_ZONE_FROM_SEG(sbi, segno) \ + GET_ZONE_FROM_SEC(sbi, GET_SEC_FROM_SEG(sbi, segno)) #define GET_SUM_BLOCK(sbi, segno) \ - ((sbi->sm_info->ssa_blkaddr) + segno) + ((sbi)->sm_info->ssa_blkaddr + (segno)) #define GET_SUM_TYPE(footer) ((footer)->entry_type) -#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = type) +#define SET_SUM_TYPE(footer, type) ((footer)->entry_type = (type)) #define SIT_ENTRY_OFFSET(sit_i, segno) \ - (segno % sit_i->sents_per_block) + ((segno) % (sit_i)->sents_per_block) #define SIT_BLOCK_OFFSET(segno) \ - (segno / SIT_ENTRY_PER_BLOCK) + ((segno) / SIT_ENTRY_PER_BLOCK) #define START_SEGNO(segno) \ (SIT_BLOCK_OFFSET(segno) * SIT_ENTRY_PER_BLOCK) #define SIT_BLK_CNT(sbi) \ @@ -103,7 +113,7 @@ #define SECTOR_FROM_BLOCK(blk_addr) \ (((sector_t)blk_addr) << F2FS_LOG_SECTORS_PER_BLOCK) #define SECTOR_TO_BLOCK(sectors) \ - (sectors >> F2FS_LOG_SECTORS_PER_BLOCK) + ((sectors) >> F2FS_LOG_SECTORS_PER_BLOCK) /* * indicate a block allocation direction: RIGHT and LEFT. @@ -132,7 +142,10 @@ enum { */ enum { GC_CB = 0, - GC_GREEDY + GC_GREEDY, + ALLOC_NEXT, + FLUSH_DEVICE, + MAX_GC_POLICY, }; /* @@ -227,6 +240,8 @@ struct sit_info { unsigned long long mounted_time; /* mount time */ unsigned long long min_mtime; /* min. modification time */ unsigned long long max_mtime; /* max. modification time */ + + unsigned int last_victim[MAX_GC_POLICY]; /* last victim segment # */ }; struct free_segmap_info { @@ -303,17 +318,17 @@ static inline struct sec_entry *get_sec_entry(struct f2fs_sb_info *sbi, unsigned int segno) { struct sit_info *sit_i = SIT_I(sbi); - return &sit_i->sec_entries[GET_SECNO(sbi, segno)]; + return &sit_i->sec_entries[GET_SEC_FROM_SEG(sbi, segno)]; } static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno, int section) + unsigned int segno, bool use_section) { /* * In order to get # of valid blocks in a section instantly from many * segments, f2fs manages two counting structures separately. */ - if (section > 1) + if (use_section && sbi->segs_per_sec > 1) return get_sec_entry(sbi, segno)->valid_blocks; else return get_seg_entry(sbi, segno)->valid_blocks; @@ -358,8 +373,8 @@ static inline unsigned int find_next_inuse(struct free_segmap_info *free_i, static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -379,7 +394,8 @@ static inline void __set_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + set_bit(segno, free_i->free_segmap); free_i->free_segments--; if (!test_and_set_bit(secno, free_i->free_secmap)) @@ -390,8 +406,8 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; - unsigned int start_segno = secno * sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; spin_lock(&free_i->segmap_lock); @@ -412,7 +428,8 @@ static inline void __set_test_and_inuse(struct f2fs_sb_info *sbi, unsigned int segno) { struct free_segmap_info *free_i = FREE_I(sbi); - unsigned int secno = segno / sbi->segs_per_sec; + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + spin_lock(&free_i->segmap_lock); if (!test_and_set_bit(segno, free_i->free_segmap)) { free_i->free_segments--; @@ -475,27 +492,9 @@ static inline int overprovision_segments(struct f2fs_sb_info *sbi) return SM_I(sbi)->ovp_segments; } -static inline int overprovision_sections(struct f2fs_sb_info *sbi) -{ - return ((unsigned int) overprovision_segments(sbi)) / sbi->segs_per_sec; -} - static inline int reserved_sections(struct f2fs_sb_info *sbi) { - return ((unsigned int) reserved_segments(sbi)) / sbi->segs_per_sec; -} - -static inline bool need_SSR(struct f2fs_sb_info *sbi) -{ - int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES); - int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS); - int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA); - - if (test_opt(sbi, LFS)) - return false; - - return free_sections(sbi) <= (node_secs + 2 * dent_secs + imeta_secs + - reserved_sections(sbi) + 1); + return GET_SEC_FROM_SEG(sbi, (unsigned int)reserved_segments(sbi)); } static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi, @@ -540,6 +539,7 @@ static inline int utilization(struct f2fs_sb_info *sbi) */ #define DEF_MIN_IPU_UTIL 70 #define DEF_MIN_FSYNC_BLOCKS 8 +#define DEF_MIN_HOT_BLOCKS 16 enum { F2FS_IPU_FORCE, @@ -547,20 +547,22 @@ enum { F2FS_IPU_UTIL, F2FS_IPU_SSR_UTIL, F2FS_IPU_FSYNC, + F2FS_IPU_ASYNC, }; -static inline bool need_inplace_update(struct inode *inode) +static inline bool need_inplace_update_policy(struct inode *inode, + struct f2fs_io_info *fio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int policy = SM_I(sbi)->ipu_policy; - /* IPU can be done only for the user data */ - if (S_ISDIR(inode->i_mode) || f2fs_is_atomic_file(inode)) - return false; - if (test_opt(sbi, LFS)) return false; + /* if this is cold file, we should overwrite to avoid fragmentation */ + if (file_is_cold(inode)) + return true; + if (policy & (0x1 << F2FS_IPU_FORCE)) return true; if (policy & (0x1 << F2FS_IPU_SSR) && need_SSR(sbi)) @@ -572,6 +574,15 @@ static inline bool need_inplace_update(struct inode *inode) utilization(sbi) > SM_I(sbi)->min_ipu_util) return true; + /* + * IPU for rewrite async pages + */ + if (policy & (0x1 << F2FS_IPU_ASYNC) && + fio && fio->op == REQ_OP_WRITE && + !(fio->op_flags & REQ_SYNC) && + !f2fs_encrypted_inode(inode)) + return true; + /* this is only set during fdatasync */ if (policy & (0x1 << F2FS_IPU_FSYNC) && is_inode_flag_set(inode, FI_NEED_IPU)) @@ -716,6 +727,15 @@ static inline block_t sum_blk_addr(struct f2fs_sb_info *sbi, int base, int type) - (base + 1) + type; } +static inline bool no_fggc_candidate(struct f2fs_sb_info *sbi, + unsigned int secno) +{ + if (get_valid_blocks(sbi, GET_SEG_FROM_SEC(sbi, secno), true) >= + sbi->fggc_threshold) + return true; + return false; +} + static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) { if (IS_CURSEC(sbi, secno) || (sbi->cur_victim_sec == secno)) @@ -727,8 +747,8 @@ static inline bool sec_usage_check(struct f2fs_sb_info *sbi, unsigned int secno) * It is very important to gather dirty pages and write at once, so that we can * submit a big bio without interfering other data writes. * By default, 512 pages for directory data, - * 512 pages (2MB) * 3 for three types of nodes, and - * max_bio_blocks for meta are set. + * 512 pages (2MB) * 8 for nodes, and + * 256 pages * 8 for meta are set. */ static inline int nr_pages_to_skip(struct f2fs_sb_info *sbi, int type) { @@ -764,3 +784,28 @@ static inline long nr_pages_to_write(struct f2fs_sb_info *sbi, int type, wbc->nr_to_write = desired; return desired - nr_to_write; } + +static inline void wake_up_discard_thread(struct f2fs_sb_info *sbi, bool force) +{ + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + bool wakeup = false; + int i; + + if (force) + goto wake_up; + + mutex_lock(&dcc->cmd_lock); + for (i = MAX_PLIST_NUM - 1; + i >= 0 && plist_issue(dcc->pend_list_tag[i]); i--) { + if (!list_empty(&dcc->pend_list[i])) { + wakeup = true; + break; + } + } + mutex_unlock(&dcc->cmd_lock); + if (!wakeup) + return; +wake_up: + dcc->discard_wake = 1; + wake_up_interruptible_all(&dcc->discard_wait_queue); +} diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 84d5686c4aa4..315e59ad1483 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -22,8 +22,10 @@ #include #include #include +#include #include #include +#include #include "f2fs.h" #include "node.h" @@ -35,9 +37,7 @@ #define CREATE_TRACE_POINTS #include -static struct proc_dir_entry *f2fs_proc_root; static struct kmem_cache *f2fs_inode_cachep; -static struct kset *f2fs_kset; #ifdef CONFIG_F2FS_FAULT_INJECTION @@ -49,6 +49,7 @@ char *fault_name[FAULT_MAX] = { [FAULT_BLOCK] = "no more block", [FAULT_DIR_DEPTH] = "too big dir depth", [FAULT_EVICT_INODE] = "evict_inode fail", + [FAULT_TRUNCATE] = "truncate fail", [FAULT_IO] = "IO error", [FAULT_CHECKPOINT] = "checkpoint error", }; @@ -82,6 +83,7 @@ enum { Opt_discard, Opt_nodiscard, Opt_noheap, + Opt_heap, Opt_user_xattr, Opt_nouser_xattr, Opt_acl, @@ -89,6 +91,7 @@ enum { Opt_active_logs, Opt_disable_ext_identify, Opt_inline_xattr, + Opt_noinline_xattr, Opt_inline_data, Opt_inline_dentry, Opt_noinline_dentry, @@ -105,6 +108,20 @@ enum { Opt_fault_injection, Opt_lazytime, Opt_nolazytime, + Opt_quota, + Opt_noquota, + Opt_usrquota, + Opt_grpquota, + Opt_prjquota, + Opt_usrjquota, + Opt_grpjquota, + Opt_prjjquota, + Opt_offusrjquota, + Opt_offgrpjquota, + Opt_offprjjquota, + Opt_jqfmt_vfsold, + Opt_jqfmt_vfsv0, + Opt_jqfmt_vfsv1, Opt_err, }; @@ -115,6 +132,7 @@ static match_table_t f2fs_tokens = { {Opt_discard, "discard"}, {Opt_nodiscard, "nodiscard"}, {Opt_noheap, "no_heap"}, + {Opt_heap, "heap"}, {Opt_user_xattr, "user_xattr"}, {Opt_nouser_xattr, "nouser_xattr"}, {Opt_acl, "acl"}, @@ -122,6 +140,7 @@ static match_table_t f2fs_tokens = { {Opt_active_logs, "active_logs=%u"}, {Opt_disable_ext_identify, "disable_ext_identify"}, {Opt_inline_xattr, "inline_xattr"}, + {Opt_noinline_xattr, "noinline_xattr"}, {Opt_inline_data, "inline_data"}, {Opt_inline_dentry, "inline_dentry"}, {Opt_noinline_dentry, "noinline_dentry"}, @@ -138,207 +157,23 @@ static match_table_t f2fs_tokens = { {Opt_fault_injection, "fault_injection=%u"}, {Opt_lazytime, "lazytime"}, {Opt_nolazytime, "nolazytime"}, + {Opt_quota, "quota"}, + {Opt_noquota, "noquota"}, + {Opt_usrquota, "usrquota"}, + {Opt_grpquota, "grpquota"}, + {Opt_prjquota, "prjquota"}, + {Opt_usrjquota, "usrjquota=%s"}, + {Opt_grpjquota, "grpjquota=%s"}, + {Opt_prjjquota, "prjjquota=%s"}, + {Opt_offusrjquota, "usrjquota="}, + {Opt_offgrpjquota, "grpjquota="}, + {Opt_offprjjquota, "prjjquota="}, + {Opt_jqfmt_vfsold, "jqfmt=vfsold"}, + {Opt_jqfmt_vfsv0, "jqfmt=vfsv0"}, + {Opt_jqfmt_vfsv1, "jqfmt=vfsv1"}, {Opt_err, NULL}, }; -/* Sysfs support for f2fs */ -enum { - GC_THREAD, /* struct f2fs_gc_thread */ - SM_INFO, /* struct f2fs_sm_info */ - NM_INFO, /* struct f2fs_nm_info */ - F2FS_SBI, /* struct f2fs_sb_info */ -#ifdef CONFIG_F2FS_FAULT_INJECTION - FAULT_INFO_RATE, /* struct f2fs_fault_info */ - FAULT_INFO_TYPE, /* struct f2fs_fault_info */ -#endif -}; - -struct f2fs_attr { - struct attribute attr; - ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); - ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, - const char *, size_t); - int struct_type; - int offset; -}; - -static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) -{ - if (struct_type == GC_THREAD) - return (unsigned char *)sbi->gc_thread; - else if (struct_type == SM_INFO) - return (unsigned char *)SM_I(sbi); - else if (struct_type == NM_INFO) - return (unsigned char *)NM_I(sbi); - else if (struct_type == F2FS_SBI) - return (unsigned char *)sbi; -#ifdef CONFIG_F2FS_FAULT_INJECTION - else if (struct_type == FAULT_INFO_RATE || - struct_type == FAULT_INFO_TYPE) - return (unsigned char *)&sbi->fault_info; -#endif - return NULL; -} - -static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - struct super_block *sb = sbi->sb; - - if (!sb->s_bdev->bd_part) - return snprintf(buf, PAGE_SIZE, "0\n"); - - return snprintf(buf, PAGE_SIZE, "%llu\n", - (unsigned long long)(sbi->kbytes_written + - BD_PART_WRITTEN(sbi))); -} - -static ssize_t f2fs_sbi_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) -{ - unsigned char *ptr = NULL; - unsigned int *ui; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - return snprintf(buf, PAGE_SIZE, "%u\n", *ui); -} - -static ssize_t f2fs_sbi_store(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, - const char *buf, size_t count) -{ - unsigned char *ptr; - unsigned long t; - unsigned int *ui; - ssize_t ret; - - ptr = __struct_ptr(sbi, a->struct_type); - if (!ptr) - return -EINVAL; - - ui = (unsigned int *)(ptr + a->offset); - - ret = kstrtoul(skip_spaces(buf), 0, &t); - if (ret < 0) - return ret; -#ifdef CONFIG_F2FS_FAULT_INJECTION - if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) - return -EINVAL; -#endif - *ui = t; - return count; -} - -static ssize_t f2fs_attr_show(struct kobject *kobj, - struct attribute *attr, char *buf) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->show ? a->show(a, sbi, buf) : 0; -} - -static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); - - return a->store ? a->store(a, sbi, buf, len) : 0; -} - -static void f2fs_sb_release(struct kobject *kobj) -{ - struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, - s_kobj); - complete(&sbi->s_kobj_unregister); -} - -#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ -static struct f2fs_attr f2fs_attr_##_name = { \ - .attr = {.name = __stringify(_name), .mode = _mode }, \ - .show = _show, \ - .store = _store, \ - .struct_type = _struct_type, \ - .offset = _offset \ -} - -#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ - F2FS_ATTR_OFFSET(struct_type, name, 0644, \ - f2fs_sbi_show, f2fs_sbi_store, \ - offsetof(struct struct_name, elname)) - -#define F2FS_GENERAL_RO_ATTR(name) \ -static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) - -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); -F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, max_small_discards, max_discards); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); -F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); -F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); -F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); -#ifdef CONFIG_F2FS_FAULT_INJECTION -F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); -F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); -#endif -F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); - -#define ATTR_LIST(name) (&f2fs_attr_##name.attr) -static struct attribute *f2fs_attrs[] = { - ATTR_LIST(gc_min_sleep_time), - ATTR_LIST(gc_max_sleep_time), - ATTR_LIST(gc_no_gc_sleep_time), - ATTR_LIST(gc_idle), - ATTR_LIST(reclaim_segments), - ATTR_LIST(max_small_discards), - ATTR_LIST(batched_trim_sections), - ATTR_LIST(ipu_policy), - ATTR_LIST(min_ipu_util), - ATTR_LIST(min_fsync_blocks), - ATTR_LIST(max_victim_search), - ATTR_LIST(dir_level), - ATTR_LIST(ram_thresh), - ATTR_LIST(ra_nid_pages), - ATTR_LIST(dirty_nats_ratio), - ATTR_LIST(cp_interval), - ATTR_LIST(idle_interval), -#ifdef CONFIG_F2FS_FAULT_INJECTION - ATTR_LIST(inject_rate), - ATTR_LIST(inject_type), -#endif - ATTR_LIST(lifetime_write_kbytes), - NULL, -}; - -static const struct sysfs_ops f2fs_attr_ops = { - .show = f2fs_attr_show, - .store = f2fs_attr_store, -}; - -static struct kobj_type f2fs_ktype = { - .default_attrs = f2fs_attrs, - .sysfs_ops = &f2fs_attr_ops, - .release = f2fs_sb_release, -}; - void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) { struct va_format vaf; @@ -347,7 +182,7 @@ void f2fs_msg(struct super_block *sb, const char *level, const char *fmt, ...) va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - printk("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); + printk_ratelimited("%sF2FS-fs (%s): %pV\n", level, sb->s_id, &vaf); va_end(args); } @@ -358,6 +193,104 @@ static void init_once(void *foo) inode_init_once(&fi->vfs_inode); } +#ifdef CONFIG_QUOTA +static const char * const quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) +static int f2fs_set_qf_name(struct super_block *sb, int qtype, + substring_t *args) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + char *qname; + int ret = -EINVAL; + + if (sb_any_quota_loaded(sb) && !sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, + "Cannot change journaled " + "quota options when quota turned on"); + return -EINVAL; + } + qname = match_strdup(args); + if (!qname) { + f2fs_msg(sb, KERN_ERR, + "Not enough memory for storing quotafile name"); + return -EINVAL; + } + if (sbi->s_qf_names[qtype]) { + if (strcmp(sbi->s_qf_names[qtype], qname) == 0) + ret = 0; + else + f2fs_msg(sb, KERN_ERR, + "%s quota file already specified", + QTYPE2NAME(qtype)); + goto errout; + } + if (strchr(qname, '/')) { + f2fs_msg(sb, KERN_ERR, + "quotafile must be on filesystem root"); + goto errout; + } + sbi->s_qf_names[qtype] = qname; + set_opt(sbi, QUOTA); + return 0; +errout: + kfree(qname); + return ret; +} + +static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +{ + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sb_any_quota_loaded(sb) && sbi->s_qf_names[qtype]) { + f2fs_msg(sb, KERN_ERR, "Cannot change journaled quota options" + " when quota turned on"); + return -EINVAL; + } + kfree(sbi->s_qf_names[qtype]); + sbi->s_qf_names[qtype] = NULL; + return 0; +} + +static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) +{ + /* + * We do the test below only for project quotas. 'usrquota' and + * 'grpquota' mount options are allowed even without quota feature + * to support legacy quotas in quota files. + */ + if (test_opt(sbi, PRJQUOTA) && !f2fs_sb_has_project_quota(sbi->sb)) { + f2fs_msg(sbi->sb, KERN_ERR, "Project quota feature not enabled. " + "Cannot enable project quota enforcement."); + return -1; + } + if (sbi->s_qf_names[USRQUOTA] || sbi->s_qf_names[GRPQUOTA] || + sbi->s_qf_names[PRJQUOTA]) { + if (test_opt(sbi, USRQUOTA) && sbi->s_qf_names[USRQUOTA]) + clear_opt(sbi, USRQUOTA); + + if (test_opt(sbi, GRPQUOTA) && sbi->s_qf_names[GRPQUOTA]) + clear_opt(sbi, GRPQUOTA); + + if (test_opt(sbi, PRJQUOTA) && sbi->s_qf_names[PRJQUOTA]) + clear_opt(sbi, PRJQUOTA); + + if (test_opt(sbi, GRPQUOTA) || test_opt(sbi, USRQUOTA) || + test_opt(sbi, PRJQUOTA)) { + f2fs_msg(sbi->sb, KERN_ERR, "old and new quota " + "format mixing"); + return -1; + } + + if (!sbi->s_jquota_fmt) { + f2fs_msg(sbi->sb, KERN_ERR, "journaled quota format " + "not specified"); + return -1; + } + } + return 0; +} +#endif + static int parse_options(struct super_block *sb, char *options) { struct f2fs_sb_info *sbi = F2FS_SB(sb); @@ -365,6 +298,9 @@ static int parse_options(struct super_block *sb, char *options) substring_t args[MAX_OPT_ARGS]; char *p, *name; int arg = 0; +#ifdef CONFIG_QUOTA + int ret; +#endif if (!options) return 0; @@ -431,6 +367,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_noheap: set_opt(sbi, NOHEAP); break; + case Opt_heap: + clear_opt(sbi, NOHEAP); + break; #ifdef CONFIG_F2FS_FS_XATTR case Opt_user_xattr: set_opt(sbi, XATTR_USER); @@ -441,6 +380,9 @@ static int parse_options(struct super_block *sb, char *options) case Opt_inline_xattr: set_opt(sbi, INLINE_XATTR); break; + case Opt_noinline_xattr: + clear_opt(sbi, INLINE_XATTR); + break; #else case Opt_user_xattr: f2fs_msg(sb, KERN_INFO, @@ -454,6 +396,10 @@ static int parse_options(struct super_block *sb, char *options) f2fs_msg(sb, KERN_INFO, "inline_xattr options not supported"); break; + case Opt_noinline_xattr: + f2fs_msg(sb, KERN_INFO, + "noinline_xattr options not supported"); + break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL case Opt_acl: @@ -553,6 +499,7 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; #ifdef CONFIG_F2FS_FAULT_INJECTION f2fs_build_fault_attr(sbi, arg); + set_opt(sbi, FAULT_INJECTION); #else f2fs_msg(sb, KERN_INFO, "FAULT_INJECTION was not selected"); @@ -564,6 +511,81 @@ static int parse_options(struct super_block *sb, char *options) case Opt_nolazytime: sb->s_flags &= ~MS_LAZYTIME; break; +#ifdef CONFIG_QUOTA + case Opt_quota: + case Opt_usrquota: + set_opt(sbi, USRQUOTA); + break; + case Opt_grpquota: + set_opt(sbi, GRPQUOTA); + break; + case Opt_prjquota: + set_opt(sbi, PRJQUOTA); + break; + case Opt_usrjquota: + ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_grpjquota: + ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_prjjquota: + ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + if (ret) + return ret; + break; + case Opt_offusrjquota: + ret = f2fs_clear_qf_name(sb, USRQUOTA); + if (ret) + return ret; + break; + case Opt_offgrpjquota: + ret = f2fs_clear_qf_name(sb, GRPQUOTA); + if (ret) + return ret; + break; + case Opt_offprjjquota: + ret = f2fs_clear_qf_name(sb, PRJQUOTA); + if (ret) + return ret; + break; + case Opt_jqfmt_vfsold: + sbi->s_jquota_fmt = QFMT_VFS_OLD; + break; + case Opt_jqfmt_vfsv0: + sbi->s_jquota_fmt = QFMT_VFS_V0; + break; + case Opt_jqfmt_vfsv1: + sbi->s_jquota_fmt = QFMT_VFS_V1; + break; + case Opt_noquota: + clear_opt(sbi, QUOTA); + clear_opt(sbi, USRQUOTA); + clear_opt(sbi, GRPQUOTA); + clear_opt(sbi, PRJQUOTA); + break; +#else + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + case Opt_prjquota: + case Opt_usrjquota: + case Opt_grpjquota: + case Opt_prjjquota: + case Opt_offusrjquota: + case Opt_offgrpjquota: + case Opt_offprjjquota: + case Opt_jqfmt_vfsold: + case Opt_jqfmt_vfsv0: + case Opt_jqfmt_vfsv1: + case Opt_noquota: + f2fs_msg(sb, KERN_INFO, + "quota operations not supported"); + break; +#endif default: f2fs_msg(sb, KERN_ERR, "Unrecognized mount option \"%s\" or missing value", @@ -571,6 +593,10 @@ static int parse_options(struct super_block *sb, char *options) return -EINVAL; } } +#ifdef CONFIG_QUOTA + if (f2fs_check_quota_options(sbi)) + return -EINVAL; +#endif if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) { f2fs_msg(sb, KERN_ERR, @@ -603,14 +629,22 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) mutex_init(&fi->inmem_lock); init_rwsem(&fi->dio_rwsem[READ]); init_rwsem(&fi->dio_rwsem[WRITE]); + init_rwsem(&fi->i_mmap_sem); + init_rwsem(&fi->i_xattr_sem); +#ifdef CONFIG_QUOTA + memset(&fi->i_dquot, 0, sizeof(fi->i_dquot)); + fi->i_reserved_quota = 0; +#endif /* Will be used by directory only */ fi->i_dir_level = F2FS_SB(sb)->dir_level; + return &fi->vfs_inode; } static int f2fs_drop_inode(struct inode *inode) { + int ret; /* * This is to avoid a deadlock condition like below. * writeback_single_inode(inode) @@ -643,10 +677,12 @@ static int f2fs_drop_inode(struct inode *inode) spin_lock(&inode->i_lock); atomic_dec(&inode->i_count); } + trace_f2fs_drop_inode(inode, 0); return 0; } - - return generic_drop_inode(inode); + ret = generic_drop_inode(inode); + trace_f2fs_drop_inode(inode, ret); + return ret; } int f2fs_inode_dirtied(struct inode *inode, bool sync) @@ -744,15 +780,9 @@ static void destroy_device_list(struct f2fs_sb_info *sbi) static void f2fs_put_super(struct super_block *sb) { struct f2fs_sb_info *sbi = F2FS_SB(sb); + int i; - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - kobject_del(&sbi->s_kobj); - - stop_gc_thread(sbi); + f2fs_quota_off_umount(sb); /* prevent remaining shrinker jobs */ mutex_lock(&sbi->umount_mutex); @@ -771,7 +801,14 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bio(sbi, NULL_ADDR); + f2fs_wait_discard_bios(sbi); + + if (f2fs_discard_en(sbi) && !sbi->discard_blks) { + struct cp_control cpc = { + .reason = CP_UMOUNT | CP_TRIMMED, + }; + write_checkpoint(sbi, &cpc); + } /* write_checkpoint can update stat informaion */ f2fs_destroy_stats(sbi); @@ -786,7 +823,7 @@ static void f2fs_put_super(struct super_block *sb) mutex_unlock(&sbi->umount_mutex); /* our cp_error case, we can wait for any writeback page */ - f2fs_flush_merged_bios(sbi); + f2fs_flush_merged_writes(sbi); iput(sbi->node_inode); iput(sbi->meta_inode); @@ -796,8 +833,8 @@ static void f2fs_put_super(struct super_block *sb) destroy_segment_manager(sbi); kfree(sbi->ckpt); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); + + f2fs_unregister_sysfs(sbi); sb->s_fs_info = NULL; if (sbi->s_chksum_driver) @@ -805,8 +842,15 @@ static void f2fs_put_super(struct super_block *sb) kfree(sbi->raw_super); destroy_device_list(sbi); - + if (sbi->write_io_dummy) + mempool_destroy(sbi->write_io_dummy); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif destroy_percpu_info(sbi); + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); kfree(sbi); } @@ -817,6 +861,9 @@ int f2fs_sync_fs(struct super_block *sb, int sync) trace_f2fs_sync_fs(sb, sync); + if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) + return -EAGAIN; + if (sync) { struct cp_control cpc; @@ -851,12 +898,55 @@ static int f2fs_unfreeze(struct super_block *sb) return 0; } +#ifdef CONFIG_QUOTA +static int f2fs_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? + (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct f2fs_sb_info *sbi = F2FS_SB(sb); u64 id = huge_encode_dev(sb->s_bdev->bd_dev); block_t total_count, user_block_count, start_count, ovp_count; + u64 avail_node_count; total_count = le64_to_cpu(sbi->raw_super->block_count); user_block_count = sbi->user_block_count; @@ -867,19 +957,67 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; buf->f_bfree = user_block_count - valid_user_blocks(sbi) + ovp_count; - buf->f_bavail = user_block_count - valid_user_blocks(sbi); + buf->f_bavail = user_block_count - valid_user_blocks(sbi) - + sbi->reserved_blocks; - buf->f_files = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; - buf->f_ffree = min(buf->f_files - valid_node_count(sbi), - buf->f_bavail); + avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; + + if (avail_node_count > user_block_count) { + buf->f_files = user_block_count; + buf->f_ffree = buf->f_bavail; + } else { + buf->f_files = avail_node_count; + buf->f_ffree = min(avail_node_count - valid_node_count(sbi), + buf->f_bavail); + } buf->f_namelen = F2FS_NAME_LEN; buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); +#ifdef CONFIG_QUOTA + if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + } +#endif return 0; } +static inline void f2fs_show_quota_options(struct seq_file *seq, + struct super_block *sb) +{ +#ifdef CONFIG_QUOTA + struct f2fs_sb_info *sbi = F2FS_SB(sb); + + if (sbi->s_jquota_fmt) { + char *fmtname = ""; + + switch (sbi->s_jquota_fmt) { + case QFMT_VFS_OLD: + fmtname = "vfsold"; + break; + case QFMT_VFS_V0: + fmtname = "vfsv0"; + break; + case QFMT_VFS_V1: + fmtname = "vfsv1"; + break; + } + seq_printf(seq, ",jqfmt=%s", fmtname); + } + + if (sbi->s_qf_names[USRQUOTA]) + seq_show_option(seq, "usrjquota", sbi->s_qf_names[USRQUOTA]); + + if (sbi->s_qf_names[GRPQUOTA]) + seq_show_option(seq, "grpjquota", sbi->s_qf_names[GRPQUOTA]); + + if (sbi->s_qf_names[PRJQUOTA]) + seq_show_option(seq, "prjjquota", sbi->s_qf_names[PRJQUOTA]); +#endif +} + static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb); @@ -897,7 +1035,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) if (test_opt(sbi, DISCARD)) seq_puts(seq, ",discard"); if (test_opt(sbi, NOHEAP)) - seq_puts(seq, ",no_heap_alloc"); + seq_puts(seq, ",no_heap"); + else + seq_puts(seq, ",heap"); #ifdef CONFIG_F2FS_FS_XATTR if (test_opt(sbi, XATTR_USER)) seq_puts(seq, ",user_xattr"); @@ -905,6 +1045,8 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_puts(seq, ",nouser_xattr"); if (test_opt(sbi, INLINE_XATTR)) seq_puts(seq, ",inline_xattr"); + else + seq_puts(seq, ",noinline_xattr"); #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL if (test_opt(sbi, POSIX_ACL)) @@ -943,87 +1085,37 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) seq_printf(seq, ",active_logs=%u", sbi->active_logs); if (F2FS_IO_SIZE_BITS(sbi)) seq_printf(seq, ",io_size=%uKB", F2FS_IO_SIZE_KB(sbi)); +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (test_opt(sbi, FAULT_INJECTION)) + seq_printf(seq, ",fault_injection=%u", + sbi->fault_info.inject_rate); +#endif +#ifdef CONFIG_QUOTA + if (test_opt(sbi, QUOTA)) + seq_puts(seq, ",quota"); + if (test_opt(sbi, USRQUOTA)) + seq_puts(seq, ",usrquota"); + if (test_opt(sbi, GRPQUOTA)) + seq_puts(seq, ",grpquota"); + if (test_opt(sbi, PRJQUOTA)) + seq_puts(seq, ",prjquota"); +#endif + f2fs_show_quota_options(seq, sbi->sb); return 0; } -static int segment_info_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i; - - seq_puts(seq, "format: segment_type|valid_blocks\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - if ((i % 10) == 0) - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u", se->type, - get_valid_blocks(sbi, i, 1)); - if ((i % 10) == 9 || i == (total_segs - 1)) - seq_putc(seq, '\n'); - else - seq_putc(seq, ' '); - } - - return 0; -} - -static int segment_bits_seq_show(struct seq_file *seq, void *offset) -{ - struct super_block *sb = seq->private; - struct f2fs_sb_info *sbi = F2FS_SB(sb); - unsigned int total_segs = - le32_to_cpu(sbi->raw_super->segment_count_main); - int i, j; - - seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" - "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); - - for (i = 0; i < total_segs; i++) { - struct seg_entry *se = get_seg_entry(sbi, i); - - seq_printf(seq, "%-10d", i); - seq_printf(seq, "%d|%-3u|", se->type, - get_valid_blocks(sbi, i, 1)); - for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) - seq_printf(seq, " %.2x", se->cur_valid_map[j]); - seq_putc(seq, '\n'); - } - return 0; -} - -#define F2FS_PROC_FILE_DEF(_name) \ -static int _name##_open_fs(struct inode *inode, struct file *file) \ -{ \ - return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ -} \ - \ -static const struct file_operations f2fs_seq_##_name##_fops = { \ - .owner = THIS_MODULE, \ - .open = _name##_open_fs, \ - .read = seq_read, \ - .llseek = seq_lseek, \ - .release = single_release, \ -}; - -F2FS_PROC_FILE_DEF(segment_info); -F2FS_PROC_FILE_DEF(segment_bits); - static void default_options(struct f2fs_sb_info *sbi) { /* init some FS parameters */ sbi->active_logs = NR_CURSEG_TYPE; set_opt(sbi, BG_GC); + set_opt(sbi, INLINE_XATTR); set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, EXTENT_CACHE); + set_opt(sbi, NOHEAP); sbi->sb->s_flags |= MS_LAZYTIME; set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_mounted_blkzoned(sbi->sb)) { @@ -1049,6 +1141,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct f2fs_mount_info org_mount_opt; + unsigned long old_sb_flags; int err, active_logs; bool need_restart_gc = false; bool need_stop_gc = false; @@ -1056,14 +1149,37 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) #ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info ffi = sbi->fault_info; #endif +#ifdef CONFIG_QUOTA + int s_jquota_fmt; + char *s_qf_names[MAXQUOTAS]; + int i, j; +#endif /* * Save the old mount options in case we * need to restore them. */ org_mount_opt = sbi->mount_opt; + old_sb_flags = sb->s_flags; active_logs = sbi->active_logs; +#ifdef CONFIG_QUOTA + s_jquota_fmt = sbi->s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + s_qf_names[i] = kstrdup(sbi->s_qf_names[i], + GFP_KERNEL); + if (!s_qf_names[i]) { + for (j = 0; j < i; j++) + kfree(s_qf_names[j]); + return -ENOMEM; + } + } else { + s_qf_names[i] = NULL; + } + } +#endif + /* recover superblocks we couldn't write due to previous RO mount */ if (!(*flags & MS_RDONLY) && is_sbi_flag_set(sbi, SBI_NEED_SB_WRITE)) { err = f2fs_commit_super(sbi, false); @@ -1073,7 +1189,6 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_NEED_SB_WRITE); } - sbi->mount_opt.opt = 0; default_options(sbi); /* parse mount options */ @@ -1088,6 +1203,16 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if (f2fs_readonly(sb) && (*flags & MS_RDONLY)) goto skip; + if (!f2fs_readonly(sb) && (*flags & MS_RDONLY)) { + err = dquot_suspend(sb, -1); + if (err < 0) + goto restore_opts; + } else { + /* dquot_resume needs RW */ + sb->s_flags &= ~MS_RDONLY; + dquot_resume(sb, -1); + } + /* disallow enable/disable extent_cache dynamically */ if (no_extent_cache == !!test_opt(sbi, EXTENT_CACHE)) { err = -EINVAL; @@ -1136,6 +1261,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_gc; } skip: +#ifdef CONFIG_QUOTA + /* Release old quota file names */ + for (i = 0; i < MAXQUOTAS; i++) + kfree(s_qf_names[i]); +#endif /* Update the POSIXACL Flag */ sb->s_flags = (sb->s_flags & ~MS_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? MS_POSIXACL : 0); @@ -1150,21 +1280,289 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) stop_gc_thread(sbi); } restore_opts: +#ifdef CONFIG_QUOTA + sbi->s_jquota_fmt = s_jquota_fmt; + for (i = 0; i < MAXQUOTAS; i++) { + kfree(sbi->s_qf_names[i]); + sbi->s_qf_names[i] = s_qf_names[i]; + } +#endif sbi->mount_opt = org_mount_opt; sbi->active_logs = active_logs; + sb->s_flags = old_sb_flags; #ifdef CONFIG_F2FS_FAULT_INJECTION sbi->fault_info = ffi; #endif return err; } -static struct super_operations f2fs_sops = { +#ifdef CONFIG_QUOTA +/* Read data from quotafile */ +static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, + size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + block_t blkidx = F2FS_BYTES_TO_BLK(off); + int offset = off & (sb->s_blocksize - 1); + int tocopy; + size_t toread; + loff_t i_size = i_size_read(inode); + struct page *page; + char *kaddr; + + if (off > i_size) + return 0; + + if (off + len > i_size) + len = i_size - off; + toread = len; + while (toread > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); +repeat: + page = read_mapping_page(mapping, blkidx, NULL); + if (IS_ERR(page)) + return PTR_ERR(page); + + lock_page(page); + + if (unlikely(page->mapping != mapping)) { + f2fs_put_page(page, 1); + goto repeat; + } + if (unlikely(!PageUptodate(page))) { + f2fs_put_page(page, 1); + return -EIO; + } + + kaddr = kmap_atomic(page); + memcpy(data, kaddr + offset, tocopy); + kunmap_atomic(kaddr); + f2fs_put_page(page, 1); + + offset = 0; + toread -= tocopy; + data += tocopy; + blkidx++; + } + return len; +} + +/* Write to quotafile */ +static ssize_t f2fs_quota_write(struct super_block *sb, int type, + const char *data, size_t len, loff_t off) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + struct address_space *mapping = inode->i_mapping; + const struct address_space_operations *a_ops = mapping->a_ops; + int offset = off & (sb->s_blocksize - 1); + size_t towrite = len; + struct page *page; + char *kaddr; + int err = 0; + int tocopy; + + while (towrite > 0) { + tocopy = min_t(unsigned long, sb->s_blocksize - offset, + towrite); + + err = a_ops->write_begin(NULL, mapping, off, tocopy, 0, + &page, NULL); + if (unlikely(err)) + break; + + kaddr = kmap_atomic(page); + memcpy(kaddr + offset, data, tocopy); + kunmap_atomic(kaddr); + flush_dcache_page(page); + + a_ops->write_end(NULL, mapping, off, tocopy, tocopy, + page, NULL); + offset = 0; + towrite -= tocopy; + off += tocopy; + data += tocopy; + cond_resched(); + } + + if (len == towrite) + return 0; + inode->i_version++; + inode->i_mtime = inode->i_ctime = current_time(inode); + f2fs_mark_inode_dirty_sync(inode, false); + return len - towrite; +} + +static struct dquot **f2fs_get_dquots(struct inode *inode) +{ + return F2FS_I(inode)->i_dquot; +} + +static qsize_t *f2fs_get_reserved_space(struct inode *inode) +{ + return &F2FS_I(inode)->i_reserved_quota; +} + +static int f2fs_quota_on_mount(struct f2fs_sb_info *sbi, int type) +{ + return dquot_quota_on_mount(sbi->sb, sbi->s_qf_names[type], + sbi->s_jquota_fmt, type); +} + +void f2fs_enable_quota_files(struct f2fs_sb_info *sbi) +{ + int i, ret; + + for (i = 0; i < MAXQUOTAS; i++) { + if (sbi->s_qf_names[i]) { + ret = f2fs_quota_on_mount(sbi, i); + if (ret < 0) + f2fs_msg(sbi->sb, KERN_ERR, + "Cannot turn on journaled " + "quota: error %d", ret); + } + } +} + +static int f2fs_quota_sync(struct super_block *sb, int type) +{ + struct quota_info *dqopt = sb_dqopt(sb); + int cnt; + int ret; + + ret = dquot_writeback_dquots(sb, type); + if (ret) + return ret; + + /* + * Now when everything is written we can discard the pagecache so + * that userspace sees the changes. + */ + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + if (type != -1 && cnt != type) + continue; + if (!sb_has_quota_active(sb, cnt)) + continue; + + ret = filemap_write_and_wait(dqopt->files[cnt]->i_mapping); + if (ret) + return ret; + + inode_lock(dqopt->files[cnt]); + truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); + inode_unlock(dqopt->files[cnt]); + } + return 0; +} + +static int f2fs_quota_on(struct super_block *sb, int type, int format_id, + struct path *path) +{ + struct inode *inode; + int err; + + err = f2fs_quota_sync(sb, type); + if (err) + return err; + + err = dquot_quota_on(sb, type, format_id, path); + if (err) + return err; + + inode = d_inode(path->dentry); + + inode_lock(inode); + F2FS_I(inode)->i_flags |= FS_NOATIME_FL | FS_IMMUTABLE_FL; + inode_set_flags(inode, S_NOATIME | S_IMMUTABLE, + S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); + + return 0; +} + +static int f2fs_quota_off(struct super_block *sb, int type) +{ + struct inode *inode = sb_dqopt(sb)->files[type]; + int err; + + if (!inode || !igrab(inode)) + return dquot_quota_off(sb, type); + + f2fs_quota_sync(sb, type); + + err = dquot_quota_off(sb, type); + if (err) + goto out_put; + + inode_lock(inode); + F2FS_I(inode)->i_flags &= ~(FS_NOATIME_FL | FS_IMMUTABLE_FL); + inode_set_flags(inode, 0, S_NOATIME | S_IMMUTABLE); + inode_unlock(inode); + f2fs_mark_inode_dirty_sync(inode, false); +out_put: + iput(inode); + return err; +} + +void f2fs_quota_off_umount(struct super_block *sb) +{ + int type; + + for (type = 0; type < MAXQUOTAS; type++) + f2fs_quota_off(sb, type); +} + +#if 0 +int f2fs_get_projid(struct inode *inode, kprojid_t *projid) +{ + *projid = F2FS_I(inode)->i_projid; + return 0; +} +#endif + +static const struct dquot_operations f2fs_quota_operations = { + .get_reserved_space = f2fs_get_reserved_space, + .write_dquot = dquot_commit, + .acquire_dquot = dquot_acquire, + .release_dquot = dquot_release, + .mark_dirty = dquot_mark_dquot_dirty, + .write_info = dquot_commit_info, + .alloc_dquot = dquot_alloc, + .destroy_dquot = dquot_destroy, +#if 0 + .get_projid = f2fs_get_projid, + .get_next_id = dquot_get_next_id, +#endif +}; + +static const struct quotactl_ops f2fs_quotactl_ops = { + .quota_on = f2fs_quota_on, + .quota_off = f2fs_quota_off, + .quota_sync = f2fs_quota_sync, + .get_state = dquot_get_state, + .set_info = dquot_set_dqinfo, + .get_dqblk = dquot_get_dqblk, + .set_dqblk = dquot_set_dqblk, +}; +#else +void f2fs_quota_off_umount(struct super_block *sb) +{ +} +#endif + +static const struct super_operations f2fs_sops = { .alloc_inode = f2fs_alloc_inode, .drop_inode = f2fs_drop_inode, .destroy_inode = f2fs_destroy_inode, .write_inode = f2fs_write_inode, .dirty_inode = f2fs_dirty_inode, .show_options = f2fs_show_options, +#ifdef CONFIG_QUOTA + .quota_read = f2fs_quota_read, + .quota_write = f2fs_quota_write, + .get_dquots = f2fs_get_dquots, +#endif .evict_inode = f2fs_evict_inode, .put_super = f2fs_put_super, .sync_fs = f2fs_sync_fs, @@ -1182,12 +1580,6 @@ static int f2fs_get_context(struct inode *inode, void *ctx, size_t len) ctx, len, NULL); } -static int f2fs_key_prefix(struct inode *inode, u8 **key) -{ - *key = F2FS_I_SB(inode)->key_prefix; - return F2FS_I_SB(inode)->key_prefix_size; -} - static int f2fs_set_context(struct inode *inode, const void *ctx, size_t len, void *fs_data) { @@ -1202,16 +1594,16 @@ static unsigned f2fs_max_namelen(struct inode *inode) inode->i_sb->s_blocksize : F2FS_NAME_LEN; } -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { + .key_prefix = "f2fs:", .get_context = f2fs_get_context, - .key_prefix = f2fs_key_prefix, .set_context = f2fs_set_context, .is_encrypted = f2fs_encrypted_inode, .empty_dir = f2fs_empty_dir, .max_namelen = f2fs_max_namelen, }; #else -static struct fscrypt_operations f2fs_cryptops = { +static const struct fscrypt_operations f2fs_cryptops = { .is_encrypted = f2fs_encrypted_inode, }; #endif @@ -1263,9 +1655,16 @@ static const struct export_operations f2fs_export_ops = { static loff_t max_file_blocks(void) { - loff_t result = (DEF_ADDRS_PER_INODE - F2FS_INLINE_XATTR_ADDRS); + loff_t result = 0; loff_t leaf_count = ADDRS_PER_BLOCK; + /* + * note: previously, result is equal to (DEF_ADDRS_PER_INODE - + * F2FS_INLINE_XATTR_ADDRS), but now f2fs try to reserve more + * space in inode.i_addr, it will be more safe to reassign + * result as zero. + */ + /* two direct node blocks */ result += (leaf_count * 2); @@ -1467,6 +1866,13 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return 1; } + if (le32_to_cpu(raw_super->segment_count) > F2FS_MAX_SEGMENT) { + f2fs_msg(sb, KERN_INFO, + "Invalid segment count (%u)", + le32_to_cpu(raw_super->segment_count)); + return 1; + } + /* check CP/SIT/NAT/SSA/MAIN_AREA area boundary */ if (sanity_check_area_boundary(sbi, bh)) return 1; @@ -1480,6 +1886,8 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned int ovp_segments, reserved_segments; + unsigned int main_segs, blocks_per_seg; + int i; total = le32_to_cpu(raw_super->segment_count); fsmeta = le32_to_cpu(raw_super->segment_count_ckpt); @@ -1501,6 +1909,20 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) return 1; } + main_segs = le32_to_cpu(raw_super->segment_count_main); + blocks_per_seg = sbi->blocks_per_seg; + + for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_node_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_node_blkoff[i]) >= blocks_per_seg) + return 1; + } + for (i = 0; i < NR_CURSEG_DATA_TYPE; i++) { + if (le32_to_cpu(ckpt->cur_data_segno[i]) >= main_segs || + le16_to_cpu(ckpt->cur_data_blkoff[i]) >= blocks_per_seg) + return 1; + } + if (unlikely(f2fs_cp_error(sbi))) { f2fs_msg(sbi->sb, KERN_ERR, "A bug case: need to run fsck"); return 1; @@ -1511,7 +1933,7 @@ int sanity_check_ckpt(struct f2fs_sb_info *sbi) static void init_sb_info(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = sbi->raw_super; - int i; + int i, j; sbi->log_sectors_per_block = le32_to_cpu(raw_super->log_sectors_per_block); @@ -1539,17 +1961,14 @@ static void init_sb_info(struct f2fs_sb_info *sbi) for (i = 0; i < NR_COUNT_TYPE; i++) atomic_set(&sbi->nr_pages[i], 0); + atomic_set(&sbi->wb_sync_req, 0); + INIT_LIST_HEAD(&sbi->s_list); mutex_init(&sbi->umount_mutex); - mutex_init(&sbi->wio_mutex[NODE]); - mutex_init(&sbi->wio_mutex[DATA]); + for (i = 0; i < NR_PAGE_TYPE - 1; i++) + for (j = HOT; j < NR_TEMP_TYPE; j++) + mutex_init(&sbi->wio_mutex[i][j]); spin_lock_init(&sbi->cp_lock); - -#ifdef CONFIG_F2FS_FS_ENCRYPTION - memcpy(sbi->key_prefix, F2FS_KEY_DESC_PREFIX, - F2FS_KEY_DESC_PREFIX_SIZE); - sbi->key_prefix_size = F2FS_KEY_DESC_PREFIX_SIZE; -#endif } static int init_percpu_info(struct f2fs_sb_info *sbi) @@ -1579,16 +1998,16 @@ static int init_blkz_info(struct f2fs_sb_info *sbi, int devi) return 0; if (sbi->blocks_per_blkz && sbi->blocks_per_blkz != - SECTOR_TO_BLOCK(bdev_zone_size(bdev))) + SECTOR_TO_BLOCK(bdev_zone_sectors(bdev))) return -EINVAL; - sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_size(bdev)); + sbi->blocks_per_blkz = SECTOR_TO_BLOCK(bdev_zone_sectors(bdev)); if (sbi->log_blocks_per_blkz && sbi->log_blocks_per_blkz != __ilog2_u32(sbi->blocks_per_blkz)) return -EINVAL; sbi->log_blocks_per_blkz = __ilog2_u32(sbi->blocks_per_blkz); FDEV(devi).nr_blkz = SECTOR_TO_BLOCK(nr_sectors) >> sbi->log_blocks_per_blkz; - if (nr_sectors & (bdev_zone_size(bdev) - 1)) + if (nr_sectors & (bdev_zone_sectors(bdev) - 1)) FDEV(devi).nr_blkz++; FDEV(devi).blkz_type = kmalloc(FDEV(devi).nr_blkz, GFP_KERNEL); @@ -1724,36 +2143,59 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) static int f2fs_scan_devices(struct f2fs_sb_info *sbi) { struct f2fs_super_block *raw_super = F2FS_RAW_SUPER(sbi); + unsigned int max_devices = MAX_DEVICES; int i; - for (i = 0; i < MAX_DEVICES; i++) { - if (!RDEV(i).path[0]) + /* Initialize single device information */ + if (!RDEV(0).path[0]) { +#ifdef CONFIG_BLK_DEV_ZONED + if (!bdev_is_zoned(sbi->sb->s_bdev)) return 0; + max_devices = 1; +#else + return 0; +#endif + } - if (i == 0) { - sbi->devs = kzalloc(sizeof(struct f2fs_dev_info) * - MAX_DEVICES, GFP_KERNEL); - if (!sbi->devs) - return -ENOMEM; - } + /* + * Initialize multiple devices information, or single + * zoned block device information. + */ + sbi->devs = kcalloc(max_devices, sizeof(struct f2fs_dev_info), + GFP_KERNEL); + if (!sbi->devs) + return -ENOMEM; - memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); - FDEV(i).total_segments = le32_to_cpu(RDEV(i).total_segments); - if (i == 0) { - FDEV(i).start_blk = 0; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1 + - le32_to_cpu(raw_super->segment0_blkaddr); - } else { - FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; - FDEV(i).end_blk = FDEV(i).start_blk + - (FDEV(i).total_segments << - sbi->log_blocks_per_seg) - 1; - } + for (i = 0; i < max_devices; i++) { - FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + if (i > 0 && !RDEV(i).path[0]) + break; + + if (max_devices == 1) { + /* Single zoned block device mount */ + FDEV(0).bdev = + blkdev_get_by_dev(sbi->sb->s_bdev->bd_dev, sbi->sb->s_mode, sbi->sb->s_type); + } else { + /* Multi-device mount */ + memcpy(FDEV(i).path, RDEV(i).path, MAX_PATH_LEN); + FDEV(i).total_segments = + le32_to_cpu(RDEV(i).total_segments); + if (i == 0) { + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1 + + le32_to_cpu(raw_super->segment0_blkaddr); + } else { + FDEV(i).start_blk = FDEV(i - 1).end_blk + 1; + FDEV(i).end_blk = FDEV(i).start_blk + + (FDEV(i).total_segments << + sbi->log_blocks_per_seg) - 1; + } + FDEV(i).bdev = blkdev_get_by_path(FDEV(i).path, + sbi->sb->s_mode, sbi->sb->s_type); + } if (IS_ERR(FDEV(i).bdev)) return PTR_ERR(FDEV(i).bdev); @@ -1773,6 +2215,8 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) "Failed to initialize F2FS blkzone information"); return -EINVAL; } + if (max_devices == 1) + break; f2fs_msg(sbi->sb, KERN_INFO, "Mount Device [%2d]: %20s, %8u, %8x - %8x (zone: %s)", i, FDEV(i).path, @@ -1841,6 +2285,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; sbi->raw_super = raw_super; + /* precompute checksum seed for metadata */ + if (f2fs_sb_has_inode_chksum(sb)) + sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, + sizeof(raw_super->uuid)); + /* * The BLKZONED feature indicates that the drive was formatted with * zone alignment optimization. This is optional for host-aware @@ -1850,6 +2299,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (f2fs_sb_mounted_blkzoned(sb)) { f2fs_msg(sb, KERN_ERR, "Zoned block device support is not enabled\n"); + err = -EOPNOTSUPP; goto free_sb_buf; } #endif @@ -1871,6 +2321,12 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sb->s_max_links = F2FS_LINK_MAX; get_random_bytes(&sbi->s_next_generation, sizeof(u32)); +#ifdef CONFIG_QUOTA + sb->dq_op = &f2fs_quota_operations; + sb->s_qcop = &f2fs_quotactl_ops; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; +#endif + sb->s_op = &f2fs_sops; sb->s_cop = &f2fs_cryptops; sb->s_xattr = f2fs_xattr_handlers; @@ -1886,18 +2342,34 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->gc_mutex); mutex_init(&sbi->cp_mutex); init_rwsem(&sbi->node_write); + init_rwsem(&sbi->node_change); /* disallow all the data/node/meta page writes */ set_sbi_flag(sbi, SBI_POR_DOING); spin_lock_init(&sbi->stat_lock); - init_rwsem(&sbi->read_io.io_rwsem); - sbi->read_io.sbi = sbi; - sbi->read_io.bio = NULL; + /* init iostat info */ + spin_lock_init(&sbi->iostat_lock); + sbi->iostat_enable = false; + for (i = 0; i < NR_PAGE_TYPE; i++) { - init_rwsem(&sbi->write_io[i].io_rwsem); - sbi->write_io[i].sbi = sbi; - sbi->write_io[i].bio = NULL; + int n = (i == META) ? 1: NR_TEMP_TYPE; + int j; + + sbi->write_io[i] = kmalloc(n * sizeof(struct f2fs_bio_info), + GFP_KERNEL); + if (!sbi->write_io[i]) { + err = -ENOMEM; + goto free_options; + } + + for (j = HOT; j < n; j++) { + init_rwsem(&sbi->write_io[i][j].io_rwsem); + sbi->write_io[i][j].sbi = sbi; + sbi->write_io[i][j].bio = NULL; + spin_lock_init(&sbi->write_io[i][j].io_lock); + INIT_LIST_HEAD(&sbi->write_io[i][j].io_list); + } } init_rwsem(&sbi->cp_rwsem); @@ -1910,9 +2382,11 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (F2FS_IO_SIZE(sbi) > 1) { sbi->write_io_dummy = - mempool_create_page_pool(F2FS_IO_SIZE(sbi) - 1, 0); - if (!sbi->write_io_dummy) + mempool_create_page_pool(2 * (F2FS_IO_SIZE(sbi) - 1), 0); + if (!sbi->write_io_dummy) { + err = -ENOMEM; goto free_options; + } } /* get an inode for meta space */ @@ -1944,6 +2418,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->total_valid_block_count = le64_to_cpu(sbi->ckpt->valid_block_count); sbi->last_valid_block_count = sbi->total_valid_block_count; + sbi->reserved_blocks = 0; for (i = 0; i < NR_INODE_TYPE; i++) { INIT_LIST_HEAD(&sbi->inode_list[i]); @@ -1991,10 +2466,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) f2fs_join_shrinker(sbi); - /* if there are nt orphan nodes free them */ - err = recover_orphan_inodes(sbi); + err = f2fs_build_stats(sbi); if (err) - goto free_node_inode; + goto free_nm; /* read root inode and dentry */ root = f2fs_iget(sb, F2FS_ROOT_INO(sbi)); @@ -2015,26 +2489,14 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) goto free_root_inode; } - err = f2fs_build_stats(sbi); + err = f2fs_register_sysfs(sbi); if (err) goto free_root_inode; - if (f2fs_proc_root) - sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); - - if (sbi->s_proc) { - proc_create_data("segment_info", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_info_fops, sb); - proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, - &f2fs_seq_segment_bits_fops, sb); - } - - sbi->s_kobj.kset = f2fs_kset; - init_completion(&sbi->s_kobj_unregister); - err = kobject_init_and_add(&sbi->s_kobj, &f2fs_ktype, NULL, - "%s", sb->s_id); + /* if there are nt orphan nodes free them */ + err = recover_orphan_inodes(sbi); if (err) - goto free_proc; + goto free_sysfs; /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) { @@ -2045,7 +2507,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) if (bdev_read_only(sb->s_bdev) && !is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { err = -EROFS; - goto free_kobj; + goto free_meta; } if (need_fsck) @@ -2059,7 +2521,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) need_fsck = true; f2fs_msg(sb, KERN_ERR, "Cannot recover all fsync data errno=%d", err); - goto free_kobj; + goto free_meta; } } else { err = recover_fsync_data(sbi, true); @@ -2068,7 +2530,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) err = -EINVAL; f2fs_msg(sb, KERN_ERR, "Need to recover fsync data"); - goto free_kobj; + goto free_sysfs; } } skip_recovery: @@ -2083,7 +2545,7 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) /* After POR, we can run background GC thread.*/ err = start_gc_thread(sbi); if (err) - goto free_kobj; + goto free_meta; } kfree(options); @@ -2095,22 +2557,23 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) sbi->valid_super_block ? 1 : 2, err); } + f2fs_msg(sbi->sb, KERN_NOTICE, "Mounted with checkpoint version = %llx", + cur_cp_version(F2FS_CKPT(sbi))); f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); return 0; -free_kobj: +free_meta: f2fs_sync_inode_meta(sbi); - kobject_del(&sbi->s_kobj); - kobject_put(&sbi->s_kobj); - wait_for_completion(&sbi->s_kobj_unregister); -free_proc: - if (sbi->s_proc) { - remove_proc_entry("segment_info", sbi->s_proc); - remove_proc_entry("segment_bits", sbi->s_proc); - remove_proc_entry(sb->s_id, f2fs_proc_root); - } - f2fs_destroy_stats(sbi); + /* + * Some dirty meta pages can be produced by recover_orphan_inodes() + * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() + * followed by write_checkpoint() through f2fs_write_node_pages(), which + * falls into an infinite loop in sync_meta_pages(). + */ + truncate_inode_pages_final(META_MAPPING(sbi)); +free_sysfs: + f2fs_unregister_sysfs(sbi); free_root_inode: dput(sb->s_root); sb->s_root = NULL; @@ -2119,15 +2582,9 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) mutex_lock(&sbi->umount_mutex); release_ino_entry(sbi, true); f2fs_leave_shrinker(sbi); - /* - * Some dirty meta pages can be produced by recover_orphan_inodes() - * failed by EIO. Then, iput(node_inode) can trigger balance_fs_bg() - * followed by write_checkpoint() through f2fs_write_node_pages(), which - * falls into an infinite loop in sync_meta_pages(). - */ - truncate_inode_pages_final(META_MAPPING(sbi)); iput(sbi->node_inode); mutex_unlock(&sbi->umount_mutex); + f2fs_destroy_stats(sbi); free_nm: destroy_node_manager(sbi); free_sm: @@ -2141,7 +2598,13 @@ static int f2fs_fill_super(struct super_block *sb, void *data, int silent) free_io_dummy: mempool_destroy(sbi->write_io_dummy); free_options: + for (i = 0; i < NR_PAGE_TYPE; i++) + kfree(sbi->write_io[i]); destroy_percpu_info(sbi); +#ifdef CONFIG_QUOTA + for (i = 0; i < MAXQUOTAS; i++) + kfree(sbi->s_qf_names[i]); +#endif kfree(options); free_sb_buf: kfree(raw_super); @@ -2167,8 +2630,11 @@ static struct dentry *f2fs_mount(struct file_system_type *fs_type, int flags, static void kill_f2fs_super(struct super_block *sb) { - if (sb->s_root) + if (sb->s_root) { set_sbi_flag(F2FS_SB(sb), SBI_IS_CLOSE); + stop_gc_thread(F2FS_SB(sb)); + stop_discard_thread(F2FS_SB(sb)); + } kill_block_super(sb); } @@ -2222,30 +2688,26 @@ static int __init init_f2fs_fs(void) err = create_extent_cache(); if (err) goto free_checkpoint_caches; - f2fs_kset = kset_create_and_add("f2fs", NULL, fs_kobj); - if (!f2fs_kset) { - err = -ENOMEM; + err = f2fs_init_sysfs(); + if (err) goto free_extent_cache; - } err = register_shrinker(&f2fs_shrinker_info); if (err) - goto free_kset; - + goto free_sysfs; err = register_filesystem(&f2fs_fs_type); if (err) goto free_shrinker; err = f2fs_create_root_stats(); if (err) goto free_filesystem; - f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); return 0; free_filesystem: unregister_filesystem(&f2fs_fs_type); free_shrinker: unregister_shrinker(&f2fs_shrinker_info); -free_kset: - kset_unregister(f2fs_kset); +free_sysfs: + f2fs_exit_sysfs(); free_extent_cache: destroy_extent_cache(); free_checkpoint_caches: @@ -2262,11 +2724,10 @@ static int __init init_f2fs_fs(void) static void __exit exit_f2fs_fs(void) { - remove_proc_entry("fs/f2fs", NULL); f2fs_destroy_root_stats(); unregister_filesystem(&f2fs_fs_type); unregister_shrinker(&f2fs_shrinker_info); - kset_unregister(f2fs_kset); + f2fs_exit_sysfs(); destroy_extent_cache(); destroy_checkpoint_caches(); destroy_segment_manager_caches(); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c new file mode 100644 index 000000000000..e2c258f717cd --- /dev/null +++ b/fs/f2fs/sysfs.c @@ -0,0 +1,556 @@ +/* + * f2fs sysfs interface + * + * Copyright (c) 2012 Samsung Electronics Co., Ltd. + * http://www.samsung.com/ + * Copyright (c) 2017 Chao Yu + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ +#include +#include +#include + +#include "f2fs.h" +#include "segment.h" +#include "gc.h" + +static struct proc_dir_entry *f2fs_proc_root; + +/* Sysfs support for f2fs */ +enum { + GC_THREAD, /* struct f2fs_gc_thread */ + SM_INFO, /* struct f2fs_sm_info */ + DCC_INFO, /* struct discard_cmd_control */ + NM_INFO, /* struct f2fs_nm_info */ + F2FS_SBI, /* struct f2fs_sb_info */ +#ifdef CONFIG_F2FS_FAULT_INJECTION + FAULT_INFO_RATE, /* struct f2fs_fault_info */ + FAULT_INFO_TYPE, /* struct f2fs_fault_info */ +#endif + RESERVED_BLOCKS, +}; + +struct f2fs_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_attr *, struct f2fs_sb_info *, char *); + ssize_t (*store)(struct f2fs_attr *, struct f2fs_sb_info *, + const char *, size_t); + int struct_type; + int offset; + int id; +}; + +static unsigned char *__struct_ptr(struct f2fs_sb_info *sbi, int struct_type) +{ + if (struct_type == GC_THREAD) + return (unsigned char *)sbi->gc_thread; + else if (struct_type == SM_INFO) + return (unsigned char *)SM_I(sbi); + else if (struct_type == DCC_INFO) + return (unsigned char *)SM_I(sbi)->dcc_info; + else if (struct_type == NM_INFO) + return (unsigned char *)NM_I(sbi); + else if (struct_type == F2FS_SBI || struct_type == RESERVED_BLOCKS) + return (unsigned char *)sbi; +#ifdef CONFIG_F2FS_FAULT_INJECTION + else if (struct_type == FAULT_INFO_RATE || + struct_type == FAULT_INFO_TYPE) + return (unsigned char *)&sbi->fault_info; +#endif + return NULL; +} + +static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + return snprintf(buf, PAGE_SIZE, "%llu\n", + (unsigned long long)(sbi->kbytes_written + + BD_PART_WRITTEN(sbi))); +} + +static ssize_t features_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + struct super_block *sb = sbi->sb; + int len = 0; + + if (!sb->s_bdev->bd_part) + return snprintf(buf, PAGE_SIZE, "0\n"); + + if (f2fs_sb_has_crypto(sb)) + len += snprintf(buf, PAGE_SIZE - len, "%s", + "encryption"); + if (f2fs_sb_mounted_blkzoned(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "blkzoned"); + if (f2fs_sb_has_extra_attr(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "extra_attr"); + if (f2fs_sb_has_project_quota(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "projquota"); + if (f2fs_sb_has_inode_chksum(sb)) + len += snprintf(buf + len, PAGE_SIZE - len, "%s%s", + len ? ", " : "", "inode_checksum"); + len += snprintf(buf + len, PAGE_SIZE - len, "\n"); + return len; +} + +static ssize_t f2fs_sbi_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + unsigned char *ptr = NULL; + unsigned int *ui; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + return snprintf(buf, PAGE_SIZE, "%u\n", *ui); +} + +static ssize_t f2fs_sbi_store(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, + const char *buf, size_t count) +{ + unsigned char *ptr; + unsigned long t; + unsigned int *ui; + ssize_t ret; + + ptr = __struct_ptr(sbi, a->struct_type); + if (!ptr) + return -EINVAL; + + ui = (unsigned int *)(ptr + a->offset); + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret < 0) + return ret; +#ifdef CONFIG_F2FS_FAULT_INJECTION + if (a->struct_type == FAULT_INFO_TYPE && t >= (1 << FAULT_MAX)) + return -EINVAL; +#endif + if (a->struct_type == RESERVED_BLOCKS) { + spin_lock(&sbi->stat_lock); + if ((unsigned long)sbi->total_valid_block_count + t > + (unsigned long)sbi->user_block_count) { + spin_unlock(&sbi->stat_lock); + return -EINVAL; + } + *ui = t; + spin_unlock(&sbi->stat_lock); + return count; + } + + if (!strcmp(a->attr.name, "discard_granularity")) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + int i; + + if (t == 0 || t > MAX_PLIST_NUM) + return -EINVAL; + if (t == *ui) + return count; + + mutex_lock(&dcc->cmd_lock); + for (i = 0; i < MAX_PLIST_NUM; i++) { + if (i >= t - 1) + dcc->pend_list_tag[i] |= P_ACTIVE; + else + dcc->pend_list_tag[i] &= (~P_ACTIVE); + } + mutex_unlock(&dcc->cmd_lock); + + *ui = t; + return count; + } + + *ui = t; + + if (!strcmp(a->attr.name, "iostat_enable") && *ui == 0) + f2fs_reset_iostat(sbi); + if (!strcmp(a->attr.name, "gc_urgent") && t == 1 && sbi->gc_thread) { + sbi->gc_thread->gc_wake = 1; + wake_up_interruptible_all(&sbi->gc_thread->gc_wait_queue_head); + wake_up_discard_thread(sbi, true); + } + + return count; +} + +static ssize_t f2fs_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->show ? a->show(a, sbi, buf) : 0; +} + +static ssize_t f2fs_attr_store(struct kobject *kobj, struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + struct f2fs_attr *a = container_of(attr, struct f2fs_attr, attr); + + return a->store ? a->store(a, sbi, buf, len) : 0; +} + +static void f2fs_sb_release(struct kobject *kobj) +{ + struct f2fs_sb_info *sbi = container_of(kobj, struct f2fs_sb_info, + s_kobj); + complete(&sbi->s_kobj_unregister); +} + +enum feat_id { + FEAT_CRYPTO = 0, + FEAT_BLKZONED, + FEAT_ATOMIC_WRITE, + FEAT_EXTRA_ATTR, + FEAT_PROJECT_QUOTA, + FEAT_INODE_CHECKSUM, +}; + +static ssize_t f2fs_feature_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + switch (a->id) { + case FEAT_CRYPTO: + case FEAT_BLKZONED: + case FEAT_ATOMIC_WRITE: + case FEAT_EXTRA_ATTR: + case FEAT_PROJECT_QUOTA: + case FEAT_INODE_CHECKSUM: + return snprintf(buf, PAGE_SIZE, "supported\n"); + } + return 0; +} + +#define F2FS_ATTR_OFFSET(_struct_type, _name, _mode, _show, _store, _offset) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = _mode }, \ + .show = _show, \ + .store = _store, \ + .struct_type = _struct_type, \ + .offset = _offset \ +} + +#define F2FS_RW_ATTR(struct_type, struct_name, name, elname) \ + F2FS_ATTR_OFFSET(struct_type, name, 0644, \ + f2fs_sbi_show, f2fs_sbi_store, \ + offsetof(struct struct_name, elname)) + +#define F2FS_GENERAL_RO_ATTR(name) \ +static struct f2fs_attr f2fs_attr_##name = __ATTR(name, 0444, name##_show, NULL) + +#define F2FS_FEATURE_RO_ATTR(_name, _id) \ +static struct f2fs_attr f2fs_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0444 }, \ + .show = f2fs_feature_show, \ + .id = _id, \ +} + +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent_sleep_time, + urgent_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_min_sleep_time, min_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_max_sleep_time, max_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_no_gc_sleep_time, no_gc_sleep_time); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_idle, gc_idle); +F2FS_RW_ATTR(GC_THREAD, f2fs_gc_kthread, gc_urgent, gc_urgent); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, reclaim_segments, rec_prefree_segments); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, max_small_discards, max_discards); +F2FS_RW_ATTR(DCC_INFO, discard_cmd_control, discard_granularity, discard_granularity); +F2FS_RW_ATTR(RESERVED_BLOCKS, f2fs_sb_info, reserved_blocks, reserved_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, batched_trim_sections, trim_sections); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, ipu_policy, ipu_policy); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_ipu_util, min_ipu_util); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_fsync_blocks, min_fsync_blocks); +F2FS_RW_ATTR(SM_INFO, f2fs_sm_info, min_hot_blocks, min_hot_blocks); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ram_thresh, ram_thresh); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, ra_nid_pages, ra_nid_pages); +F2FS_RW_ATTR(NM_INFO, f2fs_nm_info, dirty_nats_ratio, dirty_nats_ratio); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, max_victim_search, max_victim_search); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, dir_level, dir_level); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, cp_interval, interval_time[CP_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, idle_interval, interval_time[REQ_TIME]); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, iostat_enable, iostat_enable); +#ifdef CONFIG_F2FS_FAULT_INJECTION +F2FS_RW_ATTR(FAULT_INFO_RATE, f2fs_fault_info, inject_rate, inject_rate); +F2FS_RW_ATTR(FAULT_INFO_TYPE, f2fs_fault_info, inject_type, inject_type); +#endif +F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); +F2FS_GENERAL_RO_ATTR(features); + +#ifdef CONFIG_F2FS_FS_ENCRYPTION +F2FS_FEATURE_RO_ATTR(encryption, FEAT_CRYPTO); +#endif +#ifdef CONFIG_BLK_DEV_ZONED +F2FS_FEATURE_RO_ATTR(block_zoned, FEAT_BLKZONED); +#endif +F2FS_FEATURE_RO_ATTR(atomic_write, FEAT_ATOMIC_WRITE); +F2FS_FEATURE_RO_ATTR(extra_attr, FEAT_EXTRA_ATTR); +F2FS_FEATURE_RO_ATTR(project_quota, FEAT_PROJECT_QUOTA); +F2FS_FEATURE_RO_ATTR(inode_checksum, FEAT_INODE_CHECKSUM); + +#define ATTR_LIST(name) (&f2fs_attr_##name.attr) +static struct attribute *f2fs_attrs[] = { + ATTR_LIST(gc_urgent_sleep_time), + ATTR_LIST(gc_min_sleep_time), + ATTR_LIST(gc_max_sleep_time), + ATTR_LIST(gc_no_gc_sleep_time), + ATTR_LIST(gc_idle), + ATTR_LIST(gc_urgent), + ATTR_LIST(reclaim_segments), + ATTR_LIST(max_small_discards), + ATTR_LIST(discard_granularity), + ATTR_LIST(batched_trim_sections), + ATTR_LIST(ipu_policy), + ATTR_LIST(min_ipu_util), + ATTR_LIST(min_fsync_blocks), + ATTR_LIST(min_hot_blocks), + ATTR_LIST(max_victim_search), + ATTR_LIST(dir_level), + ATTR_LIST(ram_thresh), + ATTR_LIST(ra_nid_pages), + ATTR_LIST(dirty_nats_ratio), + ATTR_LIST(cp_interval), + ATTR_LIST(idle_interval), + ATTR_LIST(iostat_enable), +#ifdef CONFIG_F2FS_FAULT_INJECTION + ATTR_LIST(inject_rate), + ATTR_LIST(inject_type), +#endif + ATTR_LIST(lifetime_write_kbytes), + ATTR_LIST(features), + ATTR_LIST(reserved_blocks), + NULL, +}; + +static struct attribute *f2fs_feat_attrs[] = { +#ifdef CONFIG_F2FS_FS_ENCRYPTION + ATTR_LIST(encryption), +#endif +#ifdef CONFIG_BLK_DEV_ZONED + ATTR_LIST(block_zoned), +#endif + ATTR_LIST(atomic_write), + ATTR_LIST(extra_attr), + ATTR_LIST(project_quota), + ATTR_LIST(inode_checksum), + NULL, +}; + +static const struct sysfs_ops f2fs_attr_ops = { + .show = f2fs_attr_show, + .store = f2fs_attr_store, +}; + +static struct kobj_type f2fs_sb_ktype = { + .default_attrs = f2fs_attrs, + .sysfs_ops = &f2fs_attr_ops, + .release = f2fs_sb_release, +}; + +static struct kobj_type f2fs_ktype = { + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kset f2fs_kset = { + .kobj = {.ktype = &f2fs_ktype}, +}; + +static struct kobj_type f2fs_feat_ktype = { + .default_attrs = f2fs_feat_attrs, + .sysfs_ops = &f2fs_attr_ops, +}; + +static struct kobject f2fs_feat = { + .kset = &f2fs_kset, +}; + +static int segment_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i; + + seq_puts(seq, "format: segment_type|valid_blocks\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + if ((i % 10) == 0) + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u", se->type, + get_valid_blocks(sbi, i, false)); + if ((i % 10) == 9 || i == (total_segs - 1)) + seq_putc(seq, '\n'); + else + seq_putc(seq, ' '); + } + + return 0; +} + +static int segment_bits_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + unsigned int total_segs = + le32_to_cpu(sbi->raw_super->segment_count_main); + int i, j; + + seq_puts(seq, "format: segment_type|valid_blocks|bitmaps\n" + "segment_type(0:HD, 1:WD, 2:CD, 3:HN, 4:WN, 5:CN)\n"); + + for (i = 0; i < total_segs; i++) { + struct seg_entry *se = get_seg_entry(sbi, i); + + seq_printf(seq, "%-10d", i); + seq_printf(seq, "%d|%-3u|", se->type, + get_valid_blocks(sbi, i, false)); + for (j = 0; j < SIT_VBLOCK_MAP_SIZE; j++) + seq_printf(seq, " %.2x", se->cur_valid_map[j]); + seq_putc(seq, '\n'); + } + return 0; +} + +static int iostat_info_seq_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + time64_t now = ktime_get_real_seconds(); + + if (!sbi->iostat_enable) + return 0; + + seq_printf(seq, "time: %-16llu\n", now); + + /* print app IOs */ + seq_printf(seq, "app buffered: %-16llu\n", + sbi->write_iostat[APP_BUFFERED_IO]); + seq_printf(seq, "app direct: %-16llu\n", + sbi->write_iostat[APP_DIRECT_IO]); + seq_printf(seq, "app mapped: %-16llu\n", + sbi->write_iostat[APP_MAPPED_IO]); + + /* print fs IOs */ + seq_printf(seq, "fs data: %-16llu\n", + sbi->write_iostat[FS_DATA_IO]); + seq_printf(seq, "fs node: %-16llu\n", + sbi->write_iostat[FS_NODE_IO]); + seq_printf(seq, "fs meta: %-16llu\n", + sbi->write_iostat[FS_META_IO]); + seq_printf(seq, "fs gc data: %-16llu\n", + sbi->write_iostat[FS_GC_DATA_IO]); + seq_printf(seq, "fs gc node: %-16llu\n", + sbi->write_iostat[FS_GC_NODE_IO]); + seq_printf(seq, "fs cp data: %-16llu\n", + sbi->write_iostat[FS_CP_DATA_IO]); + seq_printf(seq, "fs cp node: %-16llu\n", + sbi->write_iostat[FS_CP_NODE_IO]); + seq_printf(seq, "fs cp meta: %-16llu\n", + sbi->write_iostat[FS_CP_META_IO]); + seq_printf(seq, "fs discard: %-16llu\n", + sbi->write_iostat[FS_DISCARD]); + + return 0; +} + +#define F2FS_PROC_FILE_DEF(_name) \ +static int _name##_open_fs(struct inode *inode, struct file *file) \ +{ \ + return single_open(file, _name##_seq_show, PDE_DATA(inode)); \ +} \ + \ +static const struct file_operations f2fs_seq_##_name##_fops = { \ + .open = _name##_open_fs, \ + .read = seq_read, \ + .llseek = seq_lseek, \ + .release = single_release, \ +}; + +F2FS_PROC_FILE_DEF(segment_info); +F2FS_PROC_FILE_DEF(segment_bits); +F2FS_PROC_FILE_DEF(iostat_info); + +int __init f2fs_init_sysfs(void) +{ + int ret; + + kobject_set_name(&f2fs_kset.kobj, "f2fs"); + f2fs_kset.kobj.parent = fs_kobj; + ret = kset_register(&f2fs_kset); + if (ret) + return ret; + + ret = kobject_init_and_add(&f2fs_feat, &f2fs_feat_ktype, + NULL, "features"); + if (ret) + kset_unregister(&f2fs_kset); + else + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); + return ret; +} + +void f2fs_exit_sysfs(void) +{ + kobject_put(&f2fs_feat); + kset_unregister(&f2fs_kset); + remove_proc_entry("fs/f2fs", NULL); + f2fs_proc_root = NULL; +} + +int f2fs_register_sysfs(struct f2fs_sb_info *sbi) +{ + struct super_block *sb = sbi->sb; + int err; + + sbi->s_kobj.kset = &f2fs_kset; + init_completion(&sbi->s_kobj_unregister); + err = kobject_init_and_add(&sbi->s_kobj, &f2fs_sb_ktype, NULL, + "%s", sb->s_id); + if (err) + return err; + + if (f2fs_proc_root) + sbi->s_proc = proc_mkdir(sb->s_id, f2fs_proc_root); + + if (sbi->s_proc) { + proc_create_data("segment_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_info_fops, sb); + proc_create_data("segment_bits", S_IRUGO, sbi->s_proc, + &f2fs_seq_segment_bits_fops, sb); + proc_create_data("iostat_info", S_IRUGO, sbi->s_proc, + &f2fs_seq_iostat_info_fops, sb); + } + return 0; +} + +void f2fs_unregister_sysfs(struct f2fs_sb_info *sbi) +{ + if (sbi->s_proc) { + remove_proc_entry("iostat_info", sbi->s_proc); + remove_proc_entry("segment_info", sbi->s_proc); + remove_proc_entry("segment_bits", sbi->s_proc); + remove_proc_entry(sbi->sb->s_id, f2fs_proc_root); + } + kobject_del(&sbi->s_kobj); +} diff --git a/fs/f2fs/trace.c b/fs/f2fs/trace.c index 73b4e1d1912a..bccbbf2616d2 100644 --- a/fs/f2fs/trace.c +++ b/fs/f2fs/trace.c @@ -59,7 +59,7 @@ void f2fs_trace_pid(struct page *page) pid_t pid = task_pid_nr(current); void *p; - page->private = pid; + set_page_private(page, (unsigned long)pid); if (radix_tree_preload(GFP_NOFS)) return; @@ -138,7 +138,7 @@ static unsigned int gang_lookup_pids(pid_t *results, unsigned long first_index, radix_tree_for_each_slot(slot, &pids, &iter, first_index) { results[ret] = iter.index; - if (++ret == PIDVEC_SIZE) + if (++ret == max_items) break; } return ret; diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 1c4d5e39586c..ab658419552b 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -264,18 +264,123 @@ static struct f2fs_xattr_entry *__find_xattr(void *base_addr, int index, return entry; } +static struct f2fs_xattr_entry *__find_inline_xattr(void *base_addr, + void **last_addr, int index, + size_t len, const char *name) +{ + struct f2fs_xattr_entry *entry; + unsigned int inline_size = F2FS_INLINE_XATTR_ADDRS << 2; + + list_for_each_xattr(entry, base_addr) { + if ((void *)entry + sizeof(__u32) > base_addr + inline_size || + (void *)XATTR_NEXT_ENTRY(entry) + sizeof(__u32) > + base_addr + inline_size) { + *last_addr = entry; + return NULL; + } + if (entry->e_name_index != index) + continue; + if (entry->e_name_len != len) + continue; + if (!memcmp(entry->e_name, name, len)) + break; + } + return entry; +} + +static int lookup_all_xattrs(struct inode *inode, struct page *ipage, + unsigned int index, unsigned int len, + const char *name, struct f2fs_xattr_entry **xe, + void **base_addr) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + void *cur_addr, *txattr_addr, *last_addr = NULL; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = xnid ? VALID_XATTR_BLOCK_SIZE : 0; + unsigned int inline_size = inline_xattr_size(inode); + int err = 0; + + if (!size && !inline_size) + return -ENODATA; + + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); + if (!txattr_addr) + return -ENOMEM; + + /* read from inline xattr */ + if (inline_size) { + struct page *page = NULL; + void *inline_addr; + + if (ipage) { + inline_addr = inline_xattr_addr(ipage); + } else { + page = get_node_page(sbi, inode->i_ino); + if (IS_ERR(page)) { + err = PTR_ERR(page); + goto out; + } + inline_addr = inline_xattr_addr(page); + } + memcpy(txattr_addr, inline_addr, inline_size); + f2fs_put_page(page, 1); + + *xe = __find_inline_xattr(txattr_addr, &last_addr, + index, len, name); + if (*xe) + goto check; + } + + /* read from xattr node block */ + if (xnid) { + struct page *xpage; + void *xattr_addr; + + /* The inode already has an extended attribute block. */ + xpage = get_node_page(sbi, xnid); + if (IS_ERR(xpage)) { + err = PTR_ERR(xpage); + goto out; + } + + xattr_addr = page_address(xpage); + memcpy(txattr_addr + inline_size, xattr_addr, size); + f2fs_put_page(xpage, 1); + } + + if (last_addr) + cur_addr = XATTR_HDR(last_addr) - 1; + else + cur_addr = txattr_addr; + + *xe = __find_xattr(cur_addr, index, len, name); +check: + if (IS_XATTR_LAST_ENTRY(*xe)) { + err = -ENODATA; + goto out; + } + + *base_addr = txattr_addr; + return 0; +out: + kzfree(txattr_addr); + return err; +} + static int read_all_xattrs(struct inode *inode, struct page *ipage, void **base_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_header *header; - size_t size = PAGE_SIZE, inline_size = 0; + nid_t xnid = F2FS_I(inode)->i_xattr_nid; + unsigned int size = VALID_XATTR_BLOCK_SIZE; + unsigned int inline_size = inline_xattr_size(inode); void *txattr_addr; int err; - inline_size = inline_xattr_size(inode); - - txattr_addr = kzalloc(inline_size + size, GFP_F2FS_ZERO); + txattr_addr = kzalloc(inline_size + size + XATTR_PADDING_SIZE, + GFP_F2FS_ZERO); if (!txattr_addr) return -ENOMEM; @@ -299,19 +404,19 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, } /* read from xattr node block */ - if (F2FS_I(inode)->i_xattr_nid) { + if (xnid) { struct page *xpage; void *xattr_addr; /* The inode already has an extended attribute block. */ - xpage = get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = get_node_page(sbi, xnid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); goto fail; } xattr_addr = page_address(xpage); - memcpy(txattr_addr + inline_size, xattr_addr, PAGE_SIZE); + memcpy(txattr_addr + inline_size, xattr_addr, size); f2fs_put_page(xpage, 1); } @@ -333,14 +438,12 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, void *txattr_addr, struct page *ipage) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - size_t inline_size = 0; + size_t inline_size = inline_xattr_size(inode); void *xattr_addr; struct page *xpage; nid_t new_nid = 0; int err; - inline_size = inline_xattr_size(inode); - if (hsize > inline_size && !F2FS_I(inode)->i_xattr_nid) if (!alloc_nid(sbi, &new_nid)) return -ENOSPC; @@ -386,7 +489,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = new_node_page(&dn, XATTR_NODE_OFFSET, ipage); + xpage = new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { alloc_nid_failed(sbi, new_nid); return PTR_ERR(xpage); @@ -395,23 +498,20 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, } xattr_addr = page_address(xpage); - memcpy(xattr_addr, txattr_addr + inline_size, PAGE_SIZE - - sizeof(struct node_footer)); + memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); set_page_dirty(xpage); f2fs_put_page(xpage, 1); - /* need to checkpoint during fsync */ - F2FS_I(inode)->xattr_ver = cur_cp_version(F2FS_CKPT(sbi)); return 0; } int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, size_t buffer_size, struct page *ipage) { - struct f2fs_xattr_entry *entry; - void *base_addr; + struct f2fs_xattr_entry *entry = NULL; int error = 0; - size_t size, len; + unsigned int size, len; + void *base_addr = NULL; if (name == NULL) return -EINVAL; @@ -420,21 +520,18 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - error = read_all_xattrs(inode, ipage, &base_addr); + down_read(&F2FS_I(inode)->i_xattr_sem); + error = lookup_all_xattrs(inode, ipage, index, len, name, + &entry, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; - entry = __find_xattr(base_addr, index, len, name); - if (IS_XATTR_LAST_ENTRY(entry)) { - error = -ENODATA; - goto cleanup; - } - size = le16_to_cpu(entry->e_value_size); if (buffer && size > buffer_size) { error = -ERANGE; - goto cleanup; + goto out; } if (buffer) { @@ -442,8 +539,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, memcpy(buffer, pval, size); } error = size; - -cleanup: +out: kzfree(base_addr); return error; } @@ -456,7 +552,9 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) int error = 0; size_t rest = buffer_size; + down_read(&F2FS_I(inode)->i_xattr_sem); error = read_all_xattrs(inode, NULL, &base_addr); + up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -485,6 +583,15 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size) return error; } +static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, + const void *value, size_t size) +{ + void *pval = entry->e_name + entry->e_name_len; + + return (le16_to_cpu(entry->e_value_size) == size) && + !memcmp(pval, value, size); +} + static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, struct page *ipage, int flags) @@ -519,12 +626,17 @@ static int __f2fs_setxattr(struct inode *inode, int index, found = IS_XATTR_LAST_ENTRY(here) ? 0 : 1; - if ((flags & XATTR_REPLACE) && !found) { + if (found) { + if ((flags & XATTR_CREATE)) { + error = -EEXIST; + goto exit; + } + + if (f2fs_xattr_value_same(here, value, size)) + goto exit; + } else if ((flags & XATTR_REPLACE)) { error = -ENODATA; goto exit; - } else if ((flags & XATTR_CREATE) && found) { - error = -EEXIST; - goto exit; } last = here; @@ -618,7 +730,9 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, f2fs_lock_op(sbi); /* protect xattr_ver */ down_write(&F2FS_I(inode)->i_sem); + down_write(&F2FS_I(inode)->i_xattr_sem); err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + up_write(&F2FS_I(inode)->i_xattr_sem); up_write(&F2FS_I(inode)->i_sem); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index d2fd0387a3c7..08a4840d6d7d 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -58,10 +58,10 @@ struct f2fs_xattr_entry { #define XATTR_FIRST_ENTRY(ptr) (XATTR_ENTRY(XATTR_HDR(ptr) + 1)) #define XATTR_ROUND (3) -#define XATTR_ALIGN(size) ((size + XATTR_ROUND) & ~XATTR_ROUND) +#define XATTR_ALIGN(size) (((size) + XATTR_ROUND) & ~XATTR_ROUND) #define ENTRY_SIZE(entry) (XATTR_ALIGN(sizeof(struct f2fs_xattr_entry) + \ - entry->e_name_len + le16_to_cpu(entry->e_value_size))) + (entry)->e_name_len + le16_to_cpu((entry)->e_value_size))) #define XATTR_NEXT_ENTRY(entry) ((struct f2fs_xattr_entry *)((char *)(entry) +\ ENTRY_SIZE(entry))) @@ -72,9 +72,10 @@ struct f2fs_xattr_entry { for (entry = XATTR_FIRST_ENTRY(addr);\ !IS_XATTR_LAST_ENTRY(entry);\ entry = XATTR_NEXT_ENTRY(entry)) - -#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + PAGE_SIZE - \ - sizeof(struct node_footer) - sizeof(__u32)) +#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer)) +#define XATTR_PADDING_SIZE (sizeof(__u32)) +#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \ + VALID_XATTR_BLOCK_SIZE) #define MAX_VALUE_LEN(i) (MIN_OFFSET(i) - \ sizeof(struct f2fs_xattr_header) - \ diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h index cf54a312993f..c2a975e4a711 100644 --- a/include/linux/f2fs_fs.h +++ b/include/linux/f2fs_fs.h @@ -114,6 +114,8 @@ struct f2fs_super_block { /* * For checkpoint */ +#define CP_TRIMMED_FLAG 0x00000100 +#define CP_NAT_BITS_FLAG 0x00000080 #define CP_CRC_RECOVERY_FLAG 0x00000040 #define CP_FASTBOOT_FLAG 0x00000020 #define CP_FSCK_FLAG 0x00000010 @@ -184,6 +186,8 @@ struct f2fs_extent { #define F2FS_NAME_LEN 255 #define F2FS_INLINE_XATTR_ADDRS 50 /* 200 bytes for inline xattrs */ #define DEF_ADDRS_PER_INODE 923 /* Address Pointers in an Inode */ +#define CUR_ADDRS_PER_INODE(inode) (DEF_ADDRS_PER_INODE - \ + get_extra_isize(inode)) #define DEF_NIDS_PER_INODE 5 /* Node IDs in an Inode */ #define ADDRS_PER_INODE(inode) addrs_per_inode(inode) #define ADDRS_PER_BLOCK 1018 /* Address Pointers in a Direct Block */ @@ -203,9 +207,7 @@ struct f2fs_extent { #define F2FS_INLINE_DENTRY 0x04 /* file inline dentry flag */ #define F2FS_DATA_EXIST 0x08 /* file inline data exist flag */ #define F2FS_INLINE_DOTS 0x10 /* file having implicit dot dentries */ - -#define MAX_INLINE_DATA (sizeof(__le32) * (DEF_ADDRS_PER_INODE - \ - F2FS_INLINE_XATTR_ADDRS - 1)) +#define F2FS_EXTRA_ATTR 0x20 /* file having extra attribute */ struct f2fs_inode { __le16 i_mode; /* file mode */ @@ -233,8 +235,16 @@ struct f2fs_inode { struct f2fs_extent i_ext; /* caching a largest extent */ - __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ - + union { + struct { + __le16 i_extra_isize; /* extra inode attribute size */ + __le16 i_padding; /* padding */ + __le32 i_projid; /* project id */ + __le32 i_inode_checksum;/* inode meta checksum */ + __le32 i_extra_end[0]; /* for attribute size calculation */ + }; + __le32 i_addr[DEF_ADDRS_PER_INODE]; /* Pointers to data blocks */ + }; __le32 i_nid[DEF_NIDS_PER_INODE]; /* direct(2), indirect(2), double_indirect(1) node id */ } __packed; @@ -278,6 +288,7 @@ struct f2fs_node { * For NAT entries */ #define NAT_ENTRY_PER_BLOCK (PAGE_SIZE / sizeof(struct f2fs_nat_entry)) +#define NAT_ENTRY_BITMAP_SIZE ((NAT_ENTRY_PER_BLOCK + 7) / 8) struct f2fs_nat_entry { __u8 version; /* latest version of cached nat entry */ @@ -462,7 +473,7 @@ typedef __le32 f2fs_hash_t; #define MAX_DIR_BUCKETS (1 << ((MAX_DIR_HASH_DEPTH / 2) - 1)) /* - * space utilization of regular dentry and inline dentry + * space utilization of regular dentry and inline dentry (w/o extra reservation) * regular dentry inline dentry * bitmap 1 * 27 = 27 1 * 23 = 23 * reserved 1 * 3 = 3 1 * 7 = 7 @@ -498,24 +509,6 @@ struct f2fs_dentry_block { __u8 filename[NR_DENTRY_IN_BLOCK][F2FS_SLOT_LEN]; } __packed; -/* for inline dir */ -#define NR_INLINE_DENTRY (MAX_INLINE_DATA * BITS_PER_BYTE / \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - BITS_PER_BYTE + 1)) -#define INLINE_DENTRY_BITMAP_SIZE ((NR_INLINE_DENTRY + \ - BITS_PER_BYTE - 1) / BITS_PER_BYTE) -#define INLINE_RESERVED_SIZE (MAX_INLINE_DATA - \ - ((SIZE_OF_DIR_ENTRY + F2FS_SLOT_LEN) * \ - NR_INLINE_DENTRY + INLINE_DENTRY_BITMAP_SIZE)) - -/* inline directory entry structure */ -struct f2fs_inline_dentry { - __u8 dentry_bitmap[INLINE_DENTRY_BITMAP_SIZE]; - __u8 reserved[INLINE_RESERVED_SIZE]; - struct f2fs_dir_entry dentry[NR_INLINE_DENTRY]; - __u8 filename[NR_INLINE_DENTRY][F2FS_SLOT_LEN]; -} __packed; - /* file types used in inode_info->flags */ enum { F2FS_FT_UNKNOWN, @@ -531,4 +524,6 @@ enum { #define S_SHIFT 12 +#define F2FS_DEF_PROJID 0 /* default project ID */ + #endif /* _LINUX_F2FS_FS_H */ diff --git a/include/linux/fscrypt_common.h b/include/linux/fscrypt_common.h new file mode 100644 index 000000000000..4022c61f7e9b --- /dev/null +++ b/include/linux/fscrypt_common.h @@ -0,0 +1,138 @@ +/* + * fscrypt_common.h: common declarations for per-file encryption + * + * Copyright (C) 2015, Google, Inc. + * + * Written by Michael Halcrow, 2015. + * Modified by Jaegeuk Kim, 2015. + */ + +#ifndef _LINUX_FSCRYPT_COMMON_H +#define _LINUX_FSCRYPT_COMMON_H + +#include +#include +#include +#include +#include +#include +#include + +#define FS_CRYPTO_BLOCK_SIZE 16 + +struct fscrypt_info; + +struct fscrypt_ctx { + union { + struct { + struct page *bounce_page; /* Ciphertext page */ + struct page *control_page; /* Original page */ + } w; + struct { + struct bio *bio; + struct work_struct work; + } r; + struct list_head free_list; /* Free list */ + }; + u8 flags; /* Flags */ +}; + +/** + * For encrypted symlinks, the ciphertext length is stored at the beginning + * of the string in little-endian format. + */ +struct fscrypt_symlink_data { + __le16 len; + char encrypted_path[1]; +} __packed; + +struct fscrypt_str { + unsigned char *name; + u32 len; +}; + +struct fscrypt_name { + const struct qstr *usr_fname; + struct fscrypt_str disk_name; + u32 hash; + u32 minor_hash; + struct fscrypt_str crypto_buf; +}; + +#define FSTR_INIT(n, l) { .name = n, .len = l } +#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) +#define fname_name(p) ((p)->disk_name.name) +#define fname_len(p) ((p)->disk_name.len) + +/* + * fscrypt superblock flags + */ +#define FS_CFLG_OWN_PAGES (1U << 1) + +/* + * crypto opertions for filesystems + */ +struct fscrypt_operations { + unsigned int flags; + const char *key_prefix; + int (*get_context)(struct inode *, void *, size_t); + int (*set_context)(struct inode *, const void *, size_t, void *); + int (*dummy_context)(struct inode *); + bool (*is_encrypted)(struct inode *); + bool (*empty_dir)(struct inode *); + unsigned (*max_namelen)(struct inode *); +}; + +static inline bool fscrypt_dummy_context_enabled(struct inode *inode) +{ + if (inode->i_sb->s_cop->dummy_context && + inode->i_sb->s_cop->dummy_context(inode)) + return true; + return false; +} + +static inline bool fscrypt_valid_enc_modes(u32 contents_mode, + u32 filenames_mode) +{ + if (contents_mode == FS_ENCRYPTION_MODE_AES_128_CBC && + filenames_mode == FS_ENCRYPTION_MODE_AES_128_CTS) + return true; + + if (contents_mode == FS_ENCRYPTION_MODE_AES_256_XTS && + filenames_mode == FS_ENCRYPTION_MODE_AES_256_CTS) + return true; + + return false; +} + +static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) +{ + if (str->len == 1 && str->name[0] == '.') + return true; + + if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') + return true; + + return false; +} + +static inline struct page *fscrypt_control_page(struct page *page) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return ((struct fscrypt_ctx *)page_private(page))->w.control_page; +#else + WARN_ON_ONCE(1); + return ERR_PTR(-EINVAL); +#endif +} + +static inline int fscrypt_has_encryption_key(const struct inode *inode) +{ +#if IS_ENABLED(CONFIG_FS_ENCRYPTION) + return (inode->i_crypt_info != NULL); +#else + return 0; +#endif +} + +#endif /* _LINUX_FSCRYPT_COMMON_H */ diff --git a/include/linux/fscrypt_notsupp.h b/include/linux/fscrypt_notsupp.h new file mode 100644 index 000000000000..ec406aed2f2f --- /dev/null +++ b/include/linux/fscrypt_notsupp.h @@ -0,0 +1,177 @@ +/* + * fscrypt_notsupp.h + * + * This stubs out the fscrypt functions for filesystems configured without + * encryption support. + */ + +#ifndef _LINUX_FSCRYPT_NOTSUPP_H +#define _LINUX_FSCRYPT_NOTSUPP_H + +#include + +/* crypto.c */ +static inline struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *inode, + gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline void fscrypt_release_ctx(struct fscrypt_ctx *ctx) +{ + return; +} + +static inline struct page *fscrypt_encrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, + unsigned int offs, + u64 lblk_num, gfp_t gfp_flags) +{ + return ERR_PTR(-EOPNOTSUPP); +} + +static inline int fscrypt_decrypt_page(const struct inode *inode, + struct page *page, + unsigned int len, unsigned int offs, + u64 lblk_num) +{ + return -EOPNOTSUPP; +} + + +static inline void fscrypt_restore_control_page(struct page *page) +{ + return; +} + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + return; +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + return; +} + +/* policy.c */ +static inline int fscrypt_ioctl_set_policy(struct file *filp, + const void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_ioctl_get_policy(struct file *filp, void __user *arg) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_has_permitted_context(struct inode *parent, + struct inode *child) +{ + return 0; +} + +static inline int fscrypt_inherit_context(struct inode *parent, + struct inode *child, + void *fs_data, bool preload) +{ + return -EOPNOTSUPP; +} + +/* keyinfo.c */ +static inline int fscrypt_get_encryption_info(struct inode *inode) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_put_encryption_info(struct inode *inode, + struct fscrypt_info *ci) +{ + return; +} + + /* fname.c */ +static inline int fscrypt_setup_filename(struct inode *dir, + const struct qstr *iname, + int lookup, struct fscrypt_name *fname) +{ + if (dir->i_sb->s_cop->is_encrypted(dir)) + return -EOPNOTSUPP; + + memset(fname, 0, sizeof(struct fscrypt_name)); + fname->usr_fname = iname; + fname->disk_name.name = (unsigned char *)iname->name; + fname->disk_name.len = iname->len; + return 0; +} + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + return; +} + +static inline u32 fscrypt_fname_encrypted_size(const struct inode *inode, + u32 ilen) +{ + /* never happens */ + WARN_ON(1); + return 0; +} + +static inline int fscrypt_fname_alloc_buffer(const struct inode *inode, + u32 ilen, + struct fscrypt_str *crypto_str) +{ + return -EOPNOTSUPP; +} + +static inline void fscrypt_fname_free_buffer(struct fscrypt_str *crypto_str) +{ + return; +} + +static inline int fscrypt_fname_disk_to_usr(struct inode *inode, + u32 hash, u32 minor_hash, + const struct fscrypt_str *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline int fscrypt_fname_usr_to_disk(struct inode *inode, + const struct qstr *iname, + struct fscrypt_str *oname) +{ + return -EOPNOTSUPP; +} + +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + /* Encryption support disabled; use standard comparison */ + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* bio.c */ +static inline void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *ctx, + struct bio *bio) +{ + return; +} + +static inline void fscrypt_pullback_bio_page(struct page **page, bool restore) +{ + return; +} + +static inline int fscrypt_zeroout_range(const struct inode *inode, pgoff_t lblk, + sector_t pblk, unsigned int len) +{ + return -EOPNOTSUPP; +} + +#endif /* _LINUX_FSCRYPT_NOTSUPP_H */ diff --git a/include/linux/fscrypt_supp.h b/include/linux/fscrypt_supp.h new file mode 100644 index 000000000000..32e2fcf13b01 --- /dev/null +++ b/include/linux/fscrypt_supp.h @@ -0,0 +1,145 @@ +/* + * fscrypt_supp.h + * + * This is included by filesystems configured with encryption support. + */ + +#ifndef _LINUX_FSCRYPT_SUPP_H +#define _LINUX_FSCRYPT_SUPP_H + +#include + +/* crypto.c */ +extern struct kmem_cache *fscrypt_info_cachep; +extern struct fscrypt_ctx *fscrypt_get_ctx(const struct inode *, gfp_t); +extern void fscrypt_release_ctx(struct fscrypt_ctx *); +extern struct page *fscrypt_encrypt_page(const struct inode *, struct page *, + unsigned int, unsigned int, + u64, gfp_t); +extern int fscrypt_decrypt_page(const struct inode *, struct page *, unsigned int, + unsigned int, u64); +extern void fscrypt_restore_control_page(struct page *); + +extern const struct dentry_operations fscrypt_d_ops; + +static inline void fscrypt_set_d_op(struct dentry *dentry) +{ + d_set_d_op(dentry, &fscrypt_d_ops); +} + +static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; + spin_unlock(&dentry->d_lock); +} + +/* policy.c */ +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); +extern int fscrypt_has_permitted_context(struct inode *, struct inode *); +extern int fscrypt_inherit_context(struct inode *, struct inode *, + void *, bool); +/* keyinfo.c */ +extern int fscrypt_get_encryption_info(struct inode *); +extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); + +/* fname.c */ +extern int fscrypt_setup_filename(struct inode *, const struct qstr *, + int lookup, struct fscrypt_name *); + +static inline void fscrypt_free_filename(struct fscrypt_name *fname) +{ + kfree(fname->crypto_buf.name); +} + +extern u32 fscrypt_fname_encrypted_size(const struct inode *, u32); +extern int fscrypt_fname_alloc_buffer(const struct inode *, u32, + struct fscrypt_str *); +extern void fscrypt_fname_free_buffer(struct fscrypt_str *); +extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, + const struct fscrypt_str *, struct fscrypt_str *); +extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, + struct fscrypt_str *); + +#define FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE 32 + +/* Extracts the second-to-last ciphertext block; see explanation below */ +#define FSCRYPT_FNAME_DIGEST(name, len) \ + ((name) + round_down((len) - FS_CRYPTO_BLOCK_SIZE - 1, \ + FS_CRYPTO_BLOCK_SIZE)) + +#define FSCRYPT_FNAME_DIGEST_SIZE FS_CRYPTO_BLOCK_SIZE + +/** + * fscrypt_digested_name - alternate identifier for an on-disk filename + * + * When userspace lists an encrypted directory without access to the key, + * filenames whose ciphertext is longer than FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE + * bytes are shown in this abbreviated form (base64-encoded) rather than as the + * full ciphertext (base64-encoded). This is necessary to allow supporting + * filenames up to NAME_MAX bytes, since base64 encoding expands the length. + * + * To make it possible for filesystems to still find the correct directory entry + * despite not knowing the full on-disk name, we encode any filesystem-specific + * 'hash' and/or 'minor_hash' which the filesystem may need for its lookups, + * followed by the second-to-last ciphertext block of the filename. Due to the + * use of the CBC-CTS encryption mode, the second-to-last ciphertext block + * depends on the full plaintext. (Note that ciphertext stealing causes the + * last two blocks to appear "flipped".) This makes accidental collisions very + * unlikely: just a 1 in 2^128 chance for two filenames to collide even if they + * share the same filesystem-specific hashes. + * + * However, this scheme isn't immune to intentional collisions, which can be + * created by anyone able to create arbitrary plaintext filenames and view them + * without the key. Making the "digest" be a real cryptographic hash like + * SHA-256 over the full ciphertext would prevent this, although it would be + * less efficient and harder to implement, especially since the filesystem would + * need to calculate it for each directory entry examined during a search. + */ +struct fscrypt_digested_name { + u32 hash; + u32 minor_hash; + u8 digest[FSCRYPT_FNAME_DIGEST_SIZE]; +}; + +/** + * fscrypt_match_name() - test whether the given name matches a directory entry + * @fname: the name being searched for + * @de_name: the name from the directory entry + * @de_name_len: the length of @de_name in bytes + * + * Normally @fname->disk_name will be set, and in that case we simply compare + * that to the name stored in the directory entry. The only exception is that + * if we don't have the key for an encrypted directory and a filename in it is + * very long, then we won't have the full disk_name and we'll instead need to + * match against the fscrypt_digested_name. + * + * Return: %true if the name matches, otherwise %false. + */ +static inline bool fscrypt_match_name(const struct fscrypt_name *fname, + const u8 *de_name, u32 de_name_len) +{ + if (unlikely(!fname->disk_name.name)) { + const struct fscrypt_digested_name *n = + (const void *)fname->crypto_buf.name; + if (WARN_ON_ONCE(fname->usr_fname->name[0] != '_')) + return false; + if (de_name_len <= FSCRYPT_FNAME_MAX_UNDIGESTED_SIZE) + return false; + return !memcmp(FSCRYPT_FNAME_DIGEST(de_name, de_name_len), + n->digest, FSCRYPT_FNAME_DIGEST_SIZE); + } + + if (de_name_len != fname->disk_name.len) + return false; + return !memcmp(de_name, fname->disk_name.name, fname->disk_name.len); +} + +/* bio.c */ +extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); +extern void fscrypt_pullback_bio_page(struct page **, bool); +extern int fscrypt_zeroout_range(const struct inode *, pgoff_t, sector_t, + unsigned int); + +#endif /* _LINUX_FSCRYPT_SUPP_H */ diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h index ff8b11b26f31..e6e53a36104b 100644 --- a/include/linux/fscrypto.h +++ b/include/linux/fscrypto.h @@ -250,8 +250,8 @@ extern void fscrypt_restore_control_page(struct page *); extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, unsigned int); /* policy.c */ -extern int fscrypt_process_policy(struct file *, const struct fscrypt_policy *); -extern int fscrypt_get_policy(struct inode *, struct fscrypt_policy *); +extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); +extern int fscrypt_ioctl_get_policy(struct file *, void __user *); extern int fscrypt_has_permitted_context(struct inode *, struct inode *); extern int fscrypt_inherit_context(struct inode *, struct inode *, void *, bool); @@ -320,14 +320,14 @@ static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, } /* policy.c */ -static inline int fscrypt_notsupp_process_policy(struct file *f, - const struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f, + const void __user *arg) { return -EOPNOTSUPP; } -static inline int fscrypt_notsupp_get_policy(struct inode *i, - struct fscrypt_policy *p) +static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f, + void __user *arg) { return -EOPNOTSUPP; } diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h index 217691582dd4..7063bbcca03b 100644 --- a/include/trace/events/f2fs.h +++ b/include/trace/events/f2fs.h @@ -6,8 +6,8 @@ #include -#define show_dev(entry) MAJOR(entry->dev), MINOR(entry->dev) -#define show_dev_ino(entry) show_dev(entry), (unsigned long)entry->ino +#define show_dev(dev) MAJOR(dev), MINOR(dev) +#define show_dev_ino(entry) show_dev(entry->dev), (unsigned long)entry->ino TRACE_DEFINE_ENUM(NODE); TRACE_DEFINE_ENUM(DATA); @@ -15,6 +15,7 @@ TRACE_DEFINE_ENUM(META); TRACE_DEFINE_ENUM(META_FLUSH); TRACE_DEFINE_ENUM(INMEM); TRACE_DEFINE_ENUM(INMEM_DROP); +TRACE_DEFINE_ENUM(INMEM_INVALIDATE); TRACE_DEFINE_ENUM(IPU); TRACE_DEFINE_ENUM(OPU); TRACE_DEFINE_ENUM(CURSEG_HOT_DATA); @@ -43,6 +44,7 @@ TRACE_DEFINE_ENUM(CP_FASTBOOT); TRACE_DEFINE_ENUM(CP_SYNC); TRACE_DEFINE_ENUM(CP_RECOVERY); TRACE_DEFINE_ENUM(CP_DISCARD); +TRACE_DEFINE_ENUM(CP_TRIMMED); #define show_block_type(type) \ __print_symbolic(type, \ @@ -52,6 +54,7 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { META_FLUSH, "META_FLUSH" }, \ { INMEM, "INMEM" }, \ { INMEM_DROP, "INMEM_DROP" }, \ + { INMEM_INVALIDATE, "INMEM_INVALIDATE" }, \ { INMEM_REVOKE, "INMEM_REVOKE" }, \ { IPU, "IN-PLACE" }, \ { OPU, "OUT-OF-PLACE" }) @@ -80,6 +83,12 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { REQ_META | REQ_PRIO, "(MP)" }, \ { 0, " \b" }) +#define show_block_temp(temp) \ + __print_symbolic(temp, \ + { HOT, "HOT" }, \ + { WARM, "WARM" }, \ + { COLD, "COLD" }) + #define show_data_type(type) \ __print_symbolic(type, \ { CURSEG_HOT_DATA, "Hot DATA" }, \ @@ -116,7 +125,8 @@ TRACE_DEFINE_ENUM(CP_DISCARD); { CP_FASTBOOT, "Fastboot" }, \ { CP_SYNC, "Sync" }, \ { CP_RECOVERY, "Recovery" }, \ - { CP_DISCARD, "Discard" }) + { CP_DISCARD, "Discard" }, \ + { CP_UMOUNT | CP_TRIMMED, "Umount,Trimmed" }) struct victim_sel_policy; struct f2fs_map_blocks; @@ -239,7 +249,7 @@ TRACE_EVENT(f2fs_sync_fs, ), TP_printk("dev = (%d,%d), superblock is %s, wait = %d", - show_dev(__entry), + show_dev(__entry->dev), __entry->dirty ? "dirty" : "not dirty", __entry->wait) ); @@ -309,6 +319,13 @@ DEFINE_EVENT(f2fs__inode_exit, f2fs_unlink_exit, TP_ARGS(inode, ret) ); +DEFINE_EVENT(f2fs__inode_exit, f2fs_drop_inode, + + TP_PROTO(struct inode *inode, int ret), + + TP_ARGS(inode, ret) +); + DEFINE_EVENT(f2fs__inode, f2fs_truncate, TP_PROTO(struct inode *inode), @@ -518,14 +535,14 @@ TRACE_EVENT(f2fs_map_blocks, TRACE_EVENT(f2fs_background_gc, - TP_PROTO(struct super_block *sb, long wait_ms, + TP_PROTO(struct super_block *sb, unsigned int wait_ms, unsigned int prefree, unsigned int free), TP_ARGS(sb, wait_ms, prefree, free), TP_STRUCT__entry( __field(dev_t, dev) - __field(long, wait_ms) + __field(unsigned int, wait_ms) __field(unsigned int, prefree) __field(unsigned int, free) ), @@ -537,13 +554,120 @@ TRACE_EVENT(f2fs_background_gc, __entry->free = free; ), - TP_printk("dev = (%d,%d), wait_ms = %ld, prefree = %u, free = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d), wait_ms = %u, prefree = %u, free = %u", + show_dev(__entry->dev), __entry->wait_ms, __entry->prefree, __entry->free) ); +TRACE_EVENT(f2fs_gc_begin, + + TP_PROTO(struct super_block *sb, bool sync, bool background, + long long dirty_nodes, long long dirty_dents, + long long dirty_imeta, unsigned int free_sec, + unsigned int free_seg, int reserved_seg, + unsigned int prefree_seg), + + TP_ARGS(sb, sync, background, dirty_nodes, dirty_dents, dirty_imeta, + free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(bool, sync) + __field(bool, background) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->sync = sync; + __entry->background = background; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), sync = %d, background = %d, nodes = %lld, " + "dents = %lld, imeta = %lld, free_sec:%u, free_seg:%u, " + "rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->sync, + __entry->background, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + +TRACE_EVENT(f2fs_gc_end, + + TP_PROTO(struct super_block *sb, int ret, int seg_freed, + int sec_freed, long long dirty_nodes, + long long dirty_dents, long long dirty_imeta, + unsigned int free_sec, unsigned int free_seg, + int reserved_seg, unsigned int prefree_seg), + + TP_ARGS(sb, ret, seg_freed, sec_freed, dirty_nodes, dirty_dents, + dirty_imeta, free_sec, free_seg, reserved_seg, prefree_seg), + + TP_STRUCT__entry( + __field(dev_t, dev) + __field(int, ret) + __field(int, seg_freed) + __field(int, sec_freed) + __field(long long, dirty_nodes) + __field(long long, dirty_dents) + __field(long long, dirty_imeta) + __field(unsigned int, free_sec) + __field(unsigned int, free_seg) + __field(int, reserved_seg) + __field(unsigned int, prefree_seg) + ), + + TP_fast_assign( + __entry->dev = sb->s_dev; + __entry->ret = ret; + __entry->seg_freed = seg_freed; + __entry->sec_freed = sec_freed; + __entry->dirty_nodes = dirty_nodes; + __entry->dirty_dents = dirty_dents; + __entry->dirty_imeta = dirty_imeta; + __entry->free_sec = free_sec; + __entry->free_seg = free_seg; + __entry->reserved_seg = reserved_seg; + __entry->prefree_seg = prefree_seg; + ), + + TP_printk("dev = (%d,%d), ret = %d, seg_freed = %d, sec_freed = %d, " + "nodes = %lld, dents = %lld, imeta = %lld, free_sec:%u, " + "free_seg:%u, rsv_seg:%d, prefree_seg:%u", + show_dev(__entry->dev), + __entry->ret, + __entry->seg_freed, + __entry->sec_freed, + __entry->dirty_nodes, + __entry->dirty_dents, + __entry->dirty_imeta, + __entry->free_sec, + __entry->free_seg, + __entry->reserved_seg, + __entry->prefree_seg) +); + TRACE_EVENT(f2fs_get_victim, TP_PROTO(struct super_block *sb, int type, int gc_type, @@ -580,7 +704,7 @@ TRACE_EVENT(f2fs_get_victim, TP_printk("dev = (%d,%d), type = %s, policy = (%s, %s, %s), victim = %u " "ofs_unit = %u, pre_victim_secno = %d, prefree = %u, free = %u", - show_dev(__entry), + show_dev(__entry->dev), show_data_type(__entry->type), show_gc_type(__entry->gc_type), show_alloc_mode(__entry->alloc_mode), @@ -717,7 +841,7 @@ TRACE_EVENT(f2fs_reserve_new_blocks, ), TP_printk("dev = (%d,%d), nid = %u, ofs_in_node = %u, count = %llu", - show_dev(__entry), + show_dev(__entry->dev), (unsigned int)__entry->nid, __entry->ofs_in_node, (unsigned long long)__entry->count) @@ -737,6 +861,7 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __field(block_t, new_blkaddr) __field(int, op) __field(int, op_flags) + __field(int, temp) __field(int, type) ), @@ -748,16 +873,18 @@ DECLARE_EVENT_CLASS(f2fs__submit_page_bio, __entry->new_blkaddr = fio->new_blkaddr; __entry->op = fio->op; __entry->op_flags = fio->op_flags; + __entry->temp = fio->temp; __entry->type = fio->type; ), TP_printk("dev = (%d,%d), ino = %lu, page_index = 0x%lx, " - "oldaddr = 0x%llx, newaddr = 0x%llx rw = %s%s, type = %s", + "oldaddr = 0x%llx, newaddr = 0x%llx, rw = %s(%s), type = %s_%s", show_dev_ino(__entry), (unsigned long)__entry->index, (unsigned long long)__entry->old_blkaddr, (unsigned long long)__entry->new_blkaddr, show_bio_type(__entry->op, __entry->op_flags), + show_block_temp(__entry->temp), show_block_type(__entry->type)) ); @@ -770,7 +897,7 @@ DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_bio, TP_CONDITION(page->mapping) ); -DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_mbio, +DEFINE_EVENT_CONDITION(f2fs__submit_page_bio, f2fs_submit_page_write, TP_PROTO(struct page *page, struct f2fs_io_info *fio), @@ -787,6 +914,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_STRUCT__entry( __field(dev_t, dev) + __field(dev_t, target) __field(int, op) __field(int, op_flags) __field(int, type) @@ -796,6 +924,7 @@ DECLARE_EVENT_CLASS(f2fs__bio, TP_fast_assign( __entry->dev = sb->s_dev; + __entry->target = bio->bi_bdev->bd_dev; __entry->op = bio_op(bio); __entry->op_flags = bio->bi_rw; __entry->type = type; @@ -803,8 +932,9 @@ DECLARE_EVENT_CLASS(f2fs__bio, __entry->size = bio->bi_iter.bi_size; ), - TP_printk("dev = (%d,%d), %s%s, %s, sector = %lld, size = %u", - show_dev(__entry), + TP_printk("dev = (%d,%d)/(%d,%d), rw = %s%s, %s, sector = %lld, size = %u", + show_dev(__entry->target), + show_dev(__entry->dev), show_bio_type(__entry->op, __entry->op_flags), show_block_type(__entry->type), (unsigned long long)__entry->sector, @@ -1101,16 +1231,16 @@ TRACE_EVENT(f2fs_write_checkpoint, ), TP_printk("dev = (%d,%d), checkpoint for %s, state = %s", - show_dev(__entry), + show_dev(__entry->dev), show_cpreason(__entry->reason), __entry->msg) ); -TRACE_EVENT(f2fs_issue_discard, +DECLARE_EVENT_CLASS(f2fs_discard, - TP_PROTO(struct super_block *sb, block_t blkstart, block_t blklen), + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), - TP_ARGS(sb, blkstart, blklen), + TP_ARGS(dev, blkstart, blklen), TP_STRUCT__entry( __field(dev_t, dev) @@ -1119,22 +1249,36 @@ TRACE_EVENT(f2fs_issue_discard, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; __entry->blklen = blklen; ), TP_printk("dev = (%d,%d), blkstart = 0x%llx, blklen = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart, (unsigned long long)__entry->blklen) ); +DEFINE_EVENT(f2fs_discard, f2fs_queue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + +DEFINE_EVENT(f2fs_discard, f2fs_issue_discard, + + TP_PROTO(struct block_device *dev, block_t blkstart, block_t blklen), + + TP_ARGS(dev, blkstart, blklen) +); + TRACE_EVENT(f2fs_issue_reset_zone, - TP_PROTO(struct super_block *sb, block_t blkstart), + TP_PROTO(struct block_device *dev, block_t blkstart), - TP_ARGS(sb, blkstart), + TP_ARGS(dev, blkstart), TP_STRUCT__entry( __field(dev_t, dev) @@ -1142,38 +1286,41 @@ TRACE_EVENT(f2fs_issue_reset_zone, ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->blkstart = blkstart; ), TP_printk("dev = (%d,%d), reset zone at block = 0x%llx", - show_dev(__entry), + show_dev(__entry->dev), (unsigned long long)__entry->blkstart) ); TRACE_EVENT(f2fs_issue_flush, - TP_PROTO(struct super_block *sb, unsigned int nobarrier, - unsigned int flush_merge), + TP_PROTO(struct block_device *dev, unsigned int nobarrier, + unsigned int flush_merge, int ret), - TP_ARGS(sb, nobarrier, flush_merge), + TP_ARGS(dev, nobarrier, flush_merge, ret), TP_STRUCT__entry( __field(dev_t, dev) __field(unsigned int, nobarrier) __field(unsigned int, flush_merge) + __field(int, ret) ), TP_fast_assign( - __entry->dev = sb->s_dev; + __entry->dev = dev->bd_dev; __entry->nobarrier = nobarrier; __entry->flush_merge = flush_merge; + __entry->ret = ret; ), - TP_printk("dev = (%d,%d), %s %s", - show_dev(__entry), + TP_printk("dev = (%d,%d), %s %s, ret = %d", + show_dev(__entry->dev), __entry->nobarrier ? "skip (nobarrier)" : "issue", - __entry->flush_merge ? " with flush_merge" : "") + __entry->flush_merge ? " with flush_merge" : "", + __entry->ret) ); TRACE_EVENT(f2fs_lookup_extent_tree_start, @@ -1286,7 +1433,7 @@ TRACE_EVENT(f2fs_shrink_extent_tree, ), TP_printk("dev = (%d,%d), shrunk: node_cnt = %u, tree_cnt = %u", - show_dev(__entry), + show_dev(__entry->dev), __entry->node_cnt, __entry->tree_cnt) ); @@ -1333,7 +1480,7 @@ DECLARE_EVENT_CLASS(f2fs_sync_dirty_inodes, ), TP_printk("dev = (%d,%d), %s, dirty count = %lld", - show_dev(__entry), + show_dev(__entry->dev), show_file_type(__entry->type), __entry->count) ); diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index 15048097910f..60d27496c328 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -178,6 +178,23 @@ struct inodes_stat_t { /* Policy provided via an ioctl on the topmost directory */ #define FS_KEY_DESCRIPTOR_SIZE 8 +#define FS_POLICY_FLAGS_PAD_4 0x00 +#define FS_POLICY_FLAGS_PAD_8 0x01 +#define FS_POLICY_FLAGS_PAD_16 0x02 +#define FS_POLICY_FLAGS_PAD_32 0x03 +#define FS_POLICY_FLAGS_PAD_MASK 0x03 +#define FS_POLICY_FLAGS_VALID 0x03 + +/* Encryption algorithms */ +#define FS_ENCRYPTION_MODE_INVALID 0 +#define FS_ENCRYPTION_MODE_AES_256_XTS 1 +#define FS_ENCRYPTION_MODE_AES_256_GCM 2 +#define FS_ENCRYPTION_MODE_AES_256_CBC 3 +#define FS_ENCRYPTION_MODE_AES_256_CTS 4 +#define FS_ENCRYPTION_MODE_AES_128_CBC 5 +#define FS_ENCRYPTION_MODE_AES_128_CTS 6 + + struct fscrypt_policy { __u8 version; __u8 contents_encryption_mode; @@ -190,6 +207,19 @@ struct fscrypt_policy { #define FS_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define FS_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct fscrypt_policy) +/* Parameters for passing an encryption key into the kernel keyring */ +#define FS_KEY_DESC_PREFIX "fscrypt:" +#define FS_KEY_DESC_PREFIX_SIZE 8 + +/* Structure that userspace passes to the kernel keyring */ +#define FS_MAX_KEY_SIZE 64 + +struct fscrypt_key { + __u32 mode; + __u8 raw[FS_MAX_KEY_SIZE]; + __u32 size; +}; + /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) */ diff --git a/mm/util.c b/mm/util.c index d5259b62f8d7..d7b1065644be 100644 --- a/mm/util.c +++ b/mm/util.c @@ -348,6 +348,7 @@ struct address_space *page_mapping(struct page *page) return NULL; return page->mapping; } +EXPORT_SYMBOL(page_mapping); int overcommit_ratio_handler(struct ctl_table *table, int write, void __user *buffer, size_t *lenp, From 823e6ab07b08a19b543bc7a8aaab6c7301d10f7e Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Mon, 8 May 2017 09:33:22 -0700 Subject: [PATCH 504/546] ANDROID: binder: Add tracing for binder priority inheritance. Bug: 34461621 Change-Id: I5ebb1c0c49fd42a89ee250a1d70221f767c82c7c Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 4 ++++ drivers/android/binder_trace.h | 24 ++++++++++++++++++++++++ 2 files changed, 28 insertions(+) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 32a2b2f44691..64f393239f6f 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1154,6 +1154,10 @@ static void binder_do_set_priority(struct task_struct *task, task->pid, desired.prio, to_kernel_prio(policy, priority)); + trace_binder_set_priority(task->tgid, task->pid, task->normal_prio, + to_kernel_prio(policy, priority), + desired.prio); + /* Set the actual priority */ if (task->policy != policy || is_rt_policy(policy)) { struct sched_param params; diff --git a/drivers/android/binder_trace.h b/drivers/android/binder_trace.h index 76e3b9c8a8a2..b11dffc521e8 100644 --- a/drivers/android/binder_trace.h +++ b/drivers/android/binder_trace.h @@ -85,6 +85,30 @@ DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_ioctl_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_write_done); DEFINE_BINDER_FUNCTION_RETURN_EVENT(binder_read_done); +TRACE_EVENT(binder_set_priority, + TP_PROTO(int proc, int thread, unsigned int old_prio, + unsigned int desired_prio, unsigned int new_prio), + TP_ARGS(proc, thread, old_prio, new_prio, desired_prio), + + TP_STRUCT__entry( + __field(int, proc) + __field(int, thread) + __field(unsigned int, old_prio) + __field(unsigned int, new_prio) + __field(unsigned int, desired_prio) + ), + TP_fast_assign( + __entry->proc = proc; + __entry->thread = thread; + __entry->old_prio = old_prio; + __entry->new_prio = new_prio; + __entry->desired_prio = desired_prio; + ), + TP_printk("proc=%d thread=%d old=%d => new=%d desired=%d", + __entry->proc, __entry->thread, __entry->old_prio, + __entry->new_prio, __entry->desired_prio) +); + TRACE_EVENT(binder_wait_for_work, TP_PROTO(bool proc_work, bool transaction_stack, bool thread_todo), TP_ARGS(proc_work, transaction_stack, thread_todo), From b6f477cdc16fe224ff4478b25a1077df4ed690c7 Mon Sep 17 00:00:00 2001 From: Martijn Coenen Date: Thu, 24 Aug 2017 15:23:36 +0200 Subject: [PATCH 505/546] ANDROID: binder: fix transaction leak. If a call to put_user() fails, we failed to properly free a transaction and send a failed reply (if necessary). Bug: 63117588 Test: binderLibTest Change-Id: Ia98db8cd82ce354a4cdc8811c969988d585c7e31 Signed-off-by: Martijn Coenen --- drivers/android/binder.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 64f393239f6f..bc8d9ecfebec 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -2106,6 +2106,26 @@ static void binder_send_failed_reply(struct binder_transaction *t, } } +/** + * binder_cleanup_transaction() - cleans up undelivered transaction + * @t: transaction that needs to be cleaned up + * @reason: reason the transaction wasn't delivered + * @error_code: error to return to caller (if synchronous call) + */ +static void binder_cleanup_transaction(struct binder_transaction *t, + const char *reason, + uint32_t error_code) +{ + if (t->buffer->target_node && !(t->flags & TF_ONE_WAY)) { + binder_send_failed_reply(t, error_code); + } else { + binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, + "undelivered transaction %d, %s\n", + t->debug_id, reason); + binder_free_transaction(t); + } +} + /** * binder_validate_object() - checks for a valid metadata object in a buffer. * @buffer: binder_buffer that we're parsing. @@ -4188,12 +4208,20 @@ static int binder_thread_read(struct binder_proc *proc, if (put_user(cmd, (uint32_t __user *)ptr)) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "put_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(uint32_t); if (copy_to_user(ptr, &tr, sizeof(tr))) { if (t_from) binder_thread_dec_tmpref(t_from); + + binder_cleanup_transaction(t, "copy_to_user failed", + BR_FAILED_REPLY); + return -EFAULT; } ptr += sizeof(tr); @@ -4263,15 +4291,9 @@ static void binder_release_work(struct binder_proc *proc, struct binder_transaction *t; t = container_of(w, struct binder_transaction, work); - if (t->buffer->target_node && - !(t->flags & TF_ONE_WAY)) { - binder_send_failed_reply(t, BR_DEAD_REPLY); - } else { - binder_debug(BINDER_DEBUG_DEAD_TRANSACTION, - "undelivered transaction %d\n", - t->debug_id); - binder_free_transaction(t); - } + + binder_cleanup_transaction(t, "process died.", + BR_DEAD_REPLY); } break; case BINDER_WORK_RETURN_ERROR: { struct binder_error *e = container_of( From 6ef223a680d0f88fdf33d2d085df548be72351d8 Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:31 -0700 Subject: [PATCH 506/546] FROMLIST: tracing: Prepare to add preempt and irq trace events In preparation of adding irqsoff and preemptsoff enable and disable trace events, move required functions and code to make it easier to add these events in a later patch. This patch is just code movement and no functional change. Change-Id: I587d411da5efbc4959bcccd7a05c7a66c231e1e0 Cc: Steven Rostedt Cc: Peter Zijlstra Cc: kernel-team@android.com Link: https://patchwork.kernel.org/patch/9988159/ Signed-off-by: Joel Fernandes --- kernel/trace/trace_irqsoff.c | 100 ++++++++++++++++++++++++++--------- 1 file changed, 74 insertions(+), 26 deletions(-) diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index be3222b7d72e..0e56eace0dde 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,7 @@ #include "trace.h" +#if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -450,64 +451,44 @@ void time_hardirqs_off(unsigned long a0, unsigned long a1) #else /* !CONFIG_PROVE_LOCKING */ -/* - * Stubs: - */ - -void trace_softirqs_on(unsigned long ip) -{ -} - -void trace_softirqs_off(unsigned long ip) -{ -} - -inline void print_irqtrace_events(struct task_struct *curr) -{ -} - /* * We are only interested in hardirq on/off events: */ -void trace_hardirqs_on(void) +static inline void tracer_hardirqs_on(void) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_on); -void trace_hardirqs_off(void) +static inline void tracer_hardirqs_off(void) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, CALLER_ADDR1); } -EXPORT_SYMBOL(trace_hardirqs_off); -__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) stop_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_on_caller); -__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { if (!preempt_trace() && irq_trace()) start_critical_timing(CALLER_ADDR0, caller_addr); } -EXPORT_SYMBOL(trace_hardirqs_off_caller); #endif /* CONFIG_PROVE_LOCKING */ #endif /* CONFIG_IRQSOFF_TRACER */ #ifdef CONFIG_PREEMPT_TRACER -void trace_preempt_on(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) stop_critical_timing(a0, a1); } -void trace_preempt_off(unsigned long a0, unsigned long a1) +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { if (preempt_trace() && !irq_trace()) start_critical_timing(a0, a1); @@ -770,3 +751,70 @@ __init static int init_irqsoff_tracer(void) return 0; } core_initcall(init_irqsoff_tracer); +#endif /* IRQSOFF_TRACER || PREEMPTOFF_TRACER */ + +#ifndef CONFIG_IRQSOFF_TRACER +static inline void tracer_hardirqs_on(void) { } +static inline void tracer_hardirqs_off(void) { } +static inline void tracer_hardirqs_on_caller(unsigned long caller_addr) { } +static inline void tracer_hardirqs_off_caller(unsigned long caller_addr) { } +#endif + +#ifndef CONFIG_PREEMPT_TRACER +static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } +static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } +#endif + +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) +void trace_hardirqs_on(void) +{ + tracer_hardirqs_on(); +} +EXPORT_SYMBOL(trace_hardirqs_on); + +void trace_hardirqs_off(void) +{ + tracer_hardirqs_off(); +} +EXPORT_SYMBOL(trace_hardirqs_off); + +__visible void trace_hardirqs_on_caller(unsigned long caller_addr) +{ + tracer_hardirqs_on_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_on_caller); + +__visible void trace_hardirqs_off_caller(unsigned long caller_addr) +{ + tracer_hardirqs_off_caller(caller_addr); +} +EXPORT_SYMBOL(trace_hardirqs_off_caller); + +/* + * Stubs: + */ + +void trace_softirqs_on(unsigned long ip) +{ +} + +void trace_softirqs_off(unsigned long ip) +{ +} + +inline void print_irqtrace_events(struct task_struct *curr) +{ +} +#endif + +#ifdef CONFIG_PREEMPT_TRACER +void trace_preempt_on(unsigned long a0, unsigned long a1) +{ + tracer_preempt_on(a0, a1); +} + +void trace_preempt_off(unsigned long a0, unsigned long a1) +{ + tracer_preempt_off(a0, a1); +} +#endif From 43227089a4c549b7ab0bfe6c8f8d92b89b363ecd Mon Sep 17 00:00:00 2001 From: Joel Fernandes Date: Thu, 5 Oct 2017 17:54:32 -0700 Subject: [PATCH 507/546] FROMLIST: tracing: Add support for preempt and irq enable/disable events Preempt and irq trace events can be used for tracing the start and end of an atomic section which can be used by a trace viewer like systrace to graphically view the start and end of an atomic section and correlate them with latencies and scheduling issues. This also serves as a prelude to using synthetic events or probes to rewrite the preempt and irqsoff tracers, along with numerous benefits of using trace events features for these events. Change-Id: I718d40f7c3c48579adf9d7121b21495a669c89bd Cc: Steven Rostedt Cc: Peter Zilstra Cc: kernel-team@android.com Link: https://patchwork.kernel.org/patch/9988157/ Signed-off-by: Joel Fernandes --- include/linux/ftrace.h | 3 +- include/trace/events/preemptirq.h | 70 +++++++++++++++++++++++++++++++ kernel/trace/Kconfig | 11 +++++ kernel/trace/Makefile | 1 + kernel/trace/trace_irqsoff.c | 35 +++++++++++++++- 5 files changed, 118 insertions(+), 2 deletions(-) create mode 100644 include/trace/events/preemptirq.h diff --git a/include/linux/ftrace.h b/include/linux/ftrace.h index 60048c50404e..ed94cea9eaff 100644 --- a/include/linux/ftrace.h +++ b/include/linux/ftrace.h @@ -702,7 +702,8 @@ static inline void __ftrace_enabled_restore(int enabled) static inline void time_hardirqs_off(unsigned long a0, unsigned long a1) { } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) extern void trace_preempt_on(unsigned long a0, unsigned long a1); extern void trace_preempt_off(unsigned long a0, unsigned long a1); #else diff --git a/include/trace/events/preemptirq.h b/include/trace/events/preemptirq.h new file mode 100644 index 000000000000..f5024c560d8f --- /dev/null +++ b/include/trace/events/preemptirq.h @@ -0,0 +1,70 @@ +#ifdef CONFIG_PREEMPTIRQ_EVENTS + +#undef TRACE_SYSTEM +#define TRACE_SYSTEM preemptirq + +#if !defined(_TRACE_PREEMPTIRQ_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_PREEMPTIRQ_H + +#include +#include +#include +#include + +DECLARE_EVENT_CLASS(preemptirq_template, + + TP_PROTO(unsigned long ip, unsigned long parent_ip), + + TP_ARGS(ip, parent_ip), + + TP_STRUCT__entry( + __field(u32, caller_offs) + __field(u32, parent_offs) + ), + + TP_fast_assign( + __entry->caller_offs = (u32)(ip - (unsigned long)_stext); + __entry->parent_offs = (u32)(parent_ip - (unsigned long)_stext); + ), + + TP_printk("caller=%pF parent=%pF", + (void *)((unsigned long)(_stext) + __entry->caller_offs), + (void *)((unsigned long)(_stext) + __entry->parent_offs)) +); + +#ifndef CONFIG_PROVE_LOCKING +DEFINE_EVENT(preemptirq_template, irq_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); + +DEFINE_EVENT(preemptirq_template, irq_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); +#endif + +#ifdef CONFIG_DEBUG_PREEMPT +DEFINE_EVENT(preemptirq_template, preempt_disable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); + +DEFINE_EVENT(preemptirq_template, preempt_enable, + TP_PROTO(unsigned long ip, unsigned long parent_ip), + TP_ARGS(ip, parent_ip)); +#endif + +#endif /* _TRACE_PREEMPTIRQ_H */ + +#include + +#else /* !CONFIG_PREEMPTIRQ_EVENTS */ + +#define trace_irq_enable(...) +#define trace_irq_disable(...) +#define trace_preempt_enable(...) +#define trace_preempt_disable(...) +#define trace_irq_enable_rcuidle(...) +#define trace_irq_disable_rcuidle(...) +#define trace_preempt_enable_rcuidle(...) +#define trace_preempt_disable_rcuidle(...) + +#endif diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 5f5b66a2f156..006eefb6ede0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -165,6 +165,17 @@ config FUNCTION_GRAPH_TRACER address on the current task structure into a stack of calls. +config PREEMPTIRQ_EVENTS + bool "Enable trace events for preempt and irq disable/enable" + select TRACE_IRQFLAGS + depends on DEBUG_PREEMPT || !PROVE_LOCKING + default n + help + Enable tracing of disable and enable events for preemption and irqs. + For tracing preempt disable/enable events, DEBUG_PREEMPT must be + enabled. For tracing irq disable/enable events, PROVE_LOCKING must + be disabled. + config IRQSOFF_TRACER bool "Interrupts-off Latency Tracer" default n diff --git a/kernel/trace/Makefile b/kernel/trace/Makefile index a9bba37fab5a..4b35fb97ae44 100644 --- a/kernel/trace/Makefile +++ b/kernel/trace/Makefile @@ -37,6 +37,7 @@ obj-$(CONFIG_TRACING) += trace_stat.o obj-$(CONFIG_TRACING) += trace_printk.o obj-$(CONFIG_CONTEXT_SWITCH_TRACER) += trace_sched_switch.o obj-$(CONFIG_FUNCTION_TRACER) += trace_functions.o +obj-$(CONFIG_PREEMPTIRQ_EVENTS) += trace_irqsoff.o obj-$(CONFIG_IRQSOFF_TRACER) += trace_irqsoff.o obj-$(CONFIG_PREEMPT_TRACER) += trace_irqsoff.o obj-$(CONFIG_SCHED_TRACER) += trace_sched_wakeup.o diff --git a/kernel/trace/trace_irqsoff.c b/kernel/trace/trace_irqsoff.c index 0e56eace0dde..21b162c07e83 100644 --- a/kernel/trace/trace_irqsoff.c +++ b/kernel/trace/trace_irqsoff.c @@ -16,6 +16,9 @@ #include "trace.h" +#define CREATE_TRACE_POINTS +#include + #if defined(CONFIG_IRQSOFF_TRACER) || defined(CONFIG_PREEMPT_TRACER) static struct trace_array *irqsoff_trace __read_mostly; static int tracer_enabled __read_mostly; @@ -765,27 +768,54 @@ static inline void tracer_preempt_on(unsigned long a0, unsigned long a1) { } static inline void tracer_preempt_off(unsigned long a0, unsigned long a1) { } #endif +/* Per-cpu variable to prevent redundant calls when IRQs already off */ +static DEFINE_PER_CPU(int, tracing_irq_cpu); + #if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PROVE_LOCKING) void trace_hardirqs_on(void) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_on(); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on); void trace_hardirqs_off(void) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, CALLER_ADDR1); tracer_hardirqs_off(); } EXPORT_SYMBOL(trace_hardirqs_off); __visible void trace_hardirqs_on_caller(unsigned long caller_addr) { + if (!this_cpu_read(tracing_irq_cpu)) + return; + + trace_irq_enable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_on_caller(caller_addr); + + this_cpu_write(tracing_irq_cpu, 0); } EXPORT_SYMBOL(trace_hardirqs_on_caller); __visible void trace_hardirqs_off_caller(unsigned long caller_addr) { + if (this_cpu_read(tracing_irq_cpu)) + return; + + this_cpu_write(tracing_irq_cpu, 1); + + trace_irq_disable_rcuidle(CALLER_ADDR0, caller_addr); tracer_hardirqs_off_caller(caller_addr); } EXPORT_SYMBOL(trace_hardirqs_off_caller); @@ -807,14 +837,17 @@ inline void print_irqtrace_events(struct task_struct *curr) } #endif -#ifdef CONFIG_PREEMPT_TRACER +#if defined(CONFIG_PREEMPT_TRACER) || \ + (defined(CONFIG_DEBUG_PREEMPT) && defined(CONFIG_PREEMPTIRQ_EVENTS)) void trace_preempt_on(unsigned long a0, unsigned long a1) { + trace_preempt_enable_rcuidle(a0, a1); tracer_preempt_on(a0, a1); } void trace_preempt_off(unsigned long a0, unsigned long a1) { + trace_preempt_disable_rcuidle(a0, a1); tracer_preempt_off(a0, a1); } #endif From 0bae34f5abca8c7a29d576cfe844d4b52dd6dc4d Mon Sep 17 00:00:00 2001 From: David Ahern Date: Wed, 16 Dec 2015 13:20:43 -0800 Subject: [PATCH 508/546] UPSTREAM: net: l3mdev: Add master device lookup by index Add helper to lookup l3mdev master index given a device index. [cherry-pick of upstream 1a8524794fc7c70f44ac28e3a6e8fd637bc41f14] Bug: 63589535 Change-Id: I3d0758a5d0eb03791726014c9c1e32e187391e6f Signed-off-by: David Ahern Signed-off-by: David S. Miller --- include/net/l3mdev.h | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/include/net/l3mdev.h b/include/net/l3mdev.h index 5689a0c749f7..5567d46b3cff 100644 --- a/include/net/l3mdev.h +++ b/include/net/l3mdev.h @@ -51,6 +51,24 @@ static inline int l3mdev_master_ifindex(struct net_device *dev) return ifindex; } +static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) +{ + struct net_device *dev; + int rc = 0; + + if (likely(ifindex)) { + rcu_read_lock(); + + dev = dev_get_by_index_rcu(net, ifindex); + if (dev) + rc = l3mdev_master_ifindex_rcu(dev); + + rcu_read_unlock(); + } + + return rc; +} + /* get index of an interface to use for FIB lookups. For devices * enslaved to an L3 master device FIB lookups are based on the * master index @@ -170,6 +188,11 @@ static inline int l3mdev_master_ifindex(struct net_device *dev) return 0; } +static inline int l3mdev_master_ifindex_by_index(struct net *net, int ifindex) +{ + return 0; +} + static inline int l3mdev_fib_oif_rcu(struct net_device *dev) { return dev ? dev->ifindex : 0; From 4e634bfe2996f2feca6ef91df9a13797c6d9001b Mon Sep 17 00:00:00 2001 From: David Ahern Date: Sun, 14 Aug 2016 19:52:56 -0700 Subject: [PATCH 509/546] UPSTREAM: xfrm: Only add l3mdev oif to dst lookups Subash reported that commit 42a7b32b73d6 ("xfrm: Add oif to dst lookups") broke a wifi use case that uses fib rules and xfrms. The intent of 42a7b32b73d6 was driven by VRFs with IPsec. As a compromise relax the use of oif in xfrm lookups to L3 master devices only (ie., oif is either an L3 master device or is enslaved to a master device). [cherry-pick of upstream 11d7a0bb95eaaba1741bb24a7c3c169c82f09c7b] Bug: 63589535 Change-Id: Ibadb15341f6c6c7077eccfaa2c66b3bb86b251bf Fixes: 42a7b32b73d6 ("xfrm: Add oif to dst lookups") Reported-by: Subash Abhinov Kasiviswanathan Signed-off-by: David Ahern Signed-off-by: Steffen Klassert --- net/ipv4/xfrm4_policy.c | 2 +- net/ipv6/xfrm6_policy.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index 7b0edb37a115..e07ed8b1deb3 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -29,7 +29,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, memset(fl4, 0, sizeof(*fl4)); fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; - fl4->flowi4_oif = oif; + fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); if (saddr) fl4->saddr = saddr->a4; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index c074771a10f7..dd84ecd1221b 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -36,7 +36,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, int err; memset(&fl6, 0, sizeof(fl6)); - fl6.flowi6_oif = oif; + fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) From b9a1a573841bfcda5dfc400b68bf1d2bb60e6bcb Mon Sep 17 00:00:00 2001 From: Lorenzo Colitti Date: Fri, 11 Aug 2017 02:11:33 +0900 Subject: [PATCH 510/546] BACKPORT: net: xfrm: support setting an output mark. On systems that use mark-based routing it may be necessary for routing lookups to use marks in order for packets to be routed correctly. An example of such a system is Android, which uses socket marks to route packets via different networks. Currently, routing lookups in tunnel mode always use a mark of zero, making routing incorrect on such systems. This patch adds a new output_mark element to the xfrm state and a corresponding XFRMA_OUTPUT_MARK netlink attribute. The output mark differs from the existing xfrm mark in two ways: 1. The xfrm mark is used to match xfrm policies and states, while the xfrm output mark is used to set the mark (and influence the routing) of the packets emitted by those states. 2. The existing mark is constrained to be a subset of the bits of the originating socket or transformed packet, but the output mark is arbitrary and depends only on the state. The use of a separate mark provides additional flexibility. For example: - A packet subject to two transforms (e.g., transport mode inside tunnel mode) can have two different output marks applied to it, one for the transport mode SA and one for the tunnel mode SA. - On a system where socket marks determine routing, the packets emitted by an IPsec tunnel can be routed based on a mark that is determined by the tunnel, not by the marks of the unencrypted packets. - Support for setting the output marks can be introduced without breaking any existing setups that employ both mark-based routing and xfrm tunnel mode. Simply changing the code to use the xfrm mark for routing output packets could xfrm mark could change behaviour in a way that breaks these setups. If the output mark is unspecified or set to zero, the mark is not set or changed. [backport of upstream 077fbac405bfc6d41419ad6c1725804ad4e9887c] Bug: 63589535 Test: https://android-review.googlesource.com/452776/ passes Tested: make allyesconfig; make -j64 Tested: https://android-review.googlesource.com/452776 Signed-off-by: Lorenzo Colitti Signed-off-by: Steffen Klassert Change-Id: I76120fba036e21780ced31ad390faf491ea81e52 --- include/net/xfrm.h | 7 +++++-- include/uapi/linux/xfrm.h | 3 +++ net/ipv4/xfrm4_policy.c | 14 +++++++++----- net/ipv6/xfrm6_policy.c | 9 ++++++--- net/xfrm/xfrm_output.c | 3 +++ net/xfrm/xfrm_policy.c | 17 +++++++++-------- net/xfrm/xfrm_user.c | 11 +++++++++++ 7 files changed, 46 insertions(+), 18 deletions(-) diff --git a/include/net/xfrm.h b/include/net/xfrm.h index 185fb037b332..9e1325e36415 100644 --- a/include/net/xfrm.h +++ b/include/net/xfrm.h @@ -159,6 +159,7 @@ struct xfrm_state { int header_len; int trailer_len; u32 extra_flags; + u32 output_mark; } props; struct xfrm_lifetime_cfg lft; @@ -288,10 +289,12 @@ struct xfrm_policy_afinfo { struct dst_entry *(*dst_lookup)(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr); + const xfrm_address_t *daddr, + u32 mark); int (*get_saddr)(struct net *net, int oif, xfrm_address_t *saddr, - xfrm_address_t *daddr); + xfrm_address_t *daddr, + u32 mark); void (*decode_session)(struct sk_buff *skb, struct flowi *fl, int reverse); diff --git a/include/uapi/linux/xfrm.h b/include/uapi/linux/xfrm.h index 2cd9e608d0d1..8cb0f24a33f0 100644 --- a/include/uapi/linux/xfrm.h +++ b/include/uapi/linux/xfrm.h @@ -302,6 +302,9 @@ enum xfrm_attr_type_t { XFRMA_SA_EXTRA_FLAGS, /* __u32 */ XFRMA_PROTO, /* __u8 */ XFRMA_ADDRESS_FILTER, /* struct xfrm_address_filter */ + XFRMA_PAD, + XFRMA_OFFLOAD_DEV, /* struct xfrm_state_offload */ + XFRMA_OUTPUT_MARK, /* __u32 */ __XFRMA_MAX #define XFRMA_MAX (__XFRMA_MAX - 1) diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c index e07ed8b1deb3..39eebc7b2831 100644 --- a/net/ipv4/xfrm4_policy.c +++ b/net/ipv4/xfrm4_policy.c @@ -22,7 +22,8 @@ static struct xfrm_policy_afinfo xfrm4_policy_afinfo; static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct rtable *rt; @@ -30,6 +31,7 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, fl4->daddr = daddr->a4; fl4->flowi4_tos = tos; fl4->flowi4_oif = l3mdev_master_ifindex_by_index(net, oif); + fl4->flowi4_mark = mark; if (saddr) fl4->saddr = saddr->a4; @@ -44,20 +46,22 @@ static struct dst_entry *__xfrm4_dst_lookup(struct net *net, struct flowi4 *fl4, static struct dst_entry *xfrm4_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi4 fl4; - return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr); + return __xfrm4_dst_lookup(net, &fl4, tos, oif, saddr, daddr, mark); } static int xfrm4_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct flowi4 fl4; - dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr); + dst = __xfrm4_dst_lookup(net, &fl4, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c index dd84ecd1221b..1a8608cc104c 100644 --- a/net/ipv6/xfrm6_policy.c +++ b/net/ipv6/xfrm6_policy.c @@ -29,7 +29,8 @@ static struct xfrm_policy_afinfo xfrm6_policy_afinfo; static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, - const xfrm_address_t *daddr) + const xfrm_address_t *daddr, + u32 mark) { struct flowi6 fl6; struct dst_entry *dst; @@ -38,6 +39,7 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, memset(&fl6, 0, sizeof(fl6)); fl6.flowi6_oif = l3mdev_master_ifindex_by_index(net, oif); fl6.flowi6_flags = FLOWI_FLAG_SKIP_NH_OIF; + fl6.flowi6_mark = mark; memcpy(&fl6.daddr, daddr, sizeof(fl6.daddr)); if (saddr) memcpy(&fl6.saddr, saddr, sizeof(fl6.saddr)); @@ -54,12 +56,13 @@ static struct dst_entry *xfrm6_dst_lookup(struct net *net, int tos, int oif, } static int xfrm6_get_saddr(struct net *net, int oif, - xfrm_address_t *saddr, xfrm_address_t *daddr) + xfrm_address_t *saddr, xfrm_address_t *daddr, + u32 mark) { struct dst_entry *dst; struct net_device *dev; - dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr); + dst = xfrm6_dst_lookup(net, 0, oif, NULL, daddr, mark); if (IS_ERR(dst)) return -EHOSTUNREACH; diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c index ff4a91fcab9f..16e828f2540f 100644 --- a/net/xfrm/xfrm_output.c +++ b/net/xfrm/xfrm_output.c @@ -66,6 +66,9 @@ static int xfrm_output_one(struct sk_buff *skb, int err) goto error_nolock; } + if (x->props.output_mark) + skb->mark = x->props.output_mark; + err = x->outer_mode->output(x, skb); if (err) { XFRM_INC_STATS(net, LINUX_MIB_XFRMOUTSTATEMODEERROR); diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c index 0e01250f2072..4096f699ba00 100644 --- a/net/xfrm/xfrm_policy.c +++ b/net/xfrm/xfrm_policy.c @@ -119,7 +119,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, int tos, int oif, const xfrm_address_t *saddr, const xfrm_address_t *daddr, - int family) + int family, u32 mark) { struct xfrm_policy_afinfo *afinfo; struct dst_entry *dst; @@ -128,7 +128,7 @@ static inline struct dst_entry *__xfrm_dst_lookup(struct net *net, if (unlikely(afinfo == NULL)) return ERR_PTR(-EAFNOSUPPORT); - dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr); + dst = afinfo->dst_lookup(net, tos, oif, saddr, daddr, mark); xfrm_policy_put_afinfo(afinfo); @@ -139,7 +139,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, int tos, int oif, xfrm_address_t *prev_saddr, xfrm_address_t *prev_daddr, - int family) + int family, u32 mark) { struct net *net = xs_net(x); xfrm_address_t *saddr = &x->props.saddr; @@ -155,7 +155,7 @@ static inline struct dst_entry *xfrm_dst_lookup(struct xfrm_state *x, daddr = x->coaddr; } - dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family); + dst = __xfrm_dst_lookup(net, tos, oif, saddr, daddr, family, mark); if (!IS_ERR(dst)) { if (prev_saddr != saddr) @@ -1395,14 +1395,14 @@ int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk) static int xfrm_get_saddr(struct net *net, int oif, xfrm_address_t *local, - xfrm_address_t *remote, unsigned short family) + xfrm_address_t *remote, unsigned short family, u32 mark) { int err; struct xfrm_policy_afinfo *afinfo = xfrm_policy_get_afinfo(family); if (unlikely(afinfo == NULL)) return -EINVAL; - err = afinfo->get_saddr(net, oif, local, remote); + err = afinfo->get_saddr(net, oif, local, remote, mark); xfrm_policy_put_afinfo(afinfo); return err; } @@ -1433,7 +1433,7 @@ xfrm_tmpl_resolve_one(struct xfrm_policy *policy, const struct flowi *fl, if (xfrm_addr_any(local, tmpl->encap_family)) { error = xfrm_get_saddr(net, fl->flowi_oif, &tmp, remote, - tmpl->encap_family); + tmpl->encap_family, 0); if (error) goto fail; local = &tmp; @@ -1712,7 +1712,8 @@ static struct dst_entry *xfrm_bundle_create(struct xfrm_policy *policy, if (xfrm[i]->props.mode != XFRM_MODE_TRANSPORT) { family = xfrm[i]->props.family; dst = xfrm_dst_lookup(xfrm[i], tos, fl->flowi_oif, - &saddr, &daddr, family); + &saddr, &daddr, family, + xfrm[i]->props.output_mark); err = PTR_ERR(dst); if (IS_ERR(dst)) goto put_states; diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c index 4c696d4d5ce3..68010a01ea36 100644 --- a/net/xfrm/xfrm_user.c +++ b/net/xfrm/xfrm_user.c @@ -584,6 +584,9 @@ static struct xfrm_state *xfrm_state_construct(struct net *net, xfrm_mark_get(attrs, &x->mark); + if (attrs[XFRMA_OUTPUT_MARK]) + x->props.output_mark = nla_get_u32(attrs[XFRMA_OUTPUT_MARK]); + err = __xfrm_init_state(x, false); if (err) goto error; @@ -867,6 +870,11 @@ static int copy_to_user_state_extra(struct xfrm_state *x, goto out; if (x->security) ret = copy_sec_ctx(x->security, skb); + if (x->props.output_mark) { + ret = nla_put_u32(skb, XFRMA_OUTPUT_MARK, x->props.output_mark); + if (ret) + goto out; + } out: return ret; } @@ -2419,6 +2427,7 @@ static const struct nla_policy xfrma_policy[XFRMA_MAX+1] = { [XFRMA_SA_EXTRA_FLAGS] = { .type = NLA_U32 }, [XFRMA_PROTO] = { .type = NLA_U8 }, [XFRMA_ADDRESS_FILTER] = { .len = sizeof(struct xfrm_address_filter) }, + [XFRMA_OUTPUT_MARK] = { .len = NLA_U32 }, }; static const struct nla_policy xfrma_spd_policy[XFRMA_SPD_MAX+1] = { @@ -2635,6 +2644,8 @@ static inline size_t xfrm_sa_len(struct xfrm_state *x) l += nla_total_size(sizeof(*x->coaddr)); if (x->props.extra_flags) l += nla_total_size(sizeof(x->props.extra_flags)); + if (x->props.output_mark) + l += nla_total_size(sizeof(x->props.output_mark)); /* Must count x->lastused as it may become non-zero behind our back. */ l += nla_total_size(sizeof(u64)); From b8b02687f82b3b8c67abf1a2996ea018829637cc Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Wed, 27 Sep 2017 15:12:25 +0800 Subject: [PATCH 511/546] ANDROID: binder: init desired_prio.sched_policy before use it In function binder_transaction_priority(), we access desired_prio before initialzing it. This patch fix this. Change-Id: I9d14d50f9a128010476a65b52631630899a44633 Signed-off-by: Ganesh Mahendran --- drivers/android/binder.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index bc8d9ecfebec..49b8ad929bfe 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1189,7 +1189,7 @@ static void binder_transaction_priority(struct task_struct *task, struct binder_priority node_prio, bool inherit_rt) { - struct binder_priority desired_prio; + struct binder_priority desired_prio = t->priority; if (t->set_priority_called) return; @@ -1201,9 +1201,6 @@ static void binder_transaction_priority(struct task_struct *task, if (!inherit_rt && is_rt_policy(desired_prio.sched_policy)) { desired_prio.prio = NICE_TO_PRIO(0); desired_prio.sched_policy = SCHED_NORMAL; - } else { - desired_prio.prio = t->priority.prio; - desired_prio.sched_policy = t->priority.sched_policy; } if (node_prio.prio < t->priority.prio || From b5704b7ff522b99d463e5d744601c4c8dbb85afd Mon Sep 17 00:00:00 2001 From: Ganesh Mahendran Date: Tue, 26 Sep 2017 17:56:25 +0800 Subject: [PATCH 512/546] ANDROID: binder: fix node sched policy calculation We should use FLAT_BINDER_FLAG_SCHED_POLICY_MASK as the mask to calculate sched policy. Change-Id: Ic252fd7c68495830690130d792802c02f99fc8fc Signed-off-by: Ganesh Mahendran --- drivers/android/binder.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/android/binder.c b/drivers/android/binder.c index 49b8ad929bfe..112e61a4806b 100644 --- a/drivers/android/binder.c +++ b/drivers/android/binder.c @@ -1303,7 +1303,7 @@ static struct binder_node *binder_init_node_ilocked( node->cookie = cookie; node->work.type = BINDER_WORK_NODE; priority = flags & FLAT_BINDER_FLAG_PRIORITY_MASK; - node->sched_policy = (flags & FLAT_BINDER_FLAG_PRIORITY_MASK) >> + node->sched_policy = (flags & FLAT_BINDER_FLAG_SCHED_POLICY_MASK) >> FLAT_BINDER_FLAG_SCHED_POLICY_SHIFT; node->min_priority = to_kernel_prio(node->sched_policy, priority); node->accept_fds = !!(flags & FLAT_BINDER_FLAG_ACCEPTS_FDS); From 1bd8cd428b5a23e418c92b9b5499aba26c90939f Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Wed, 4 Oct 2017 16:56:43 -0700 Subject: [PATCH 513/546] ANDROID: fscrypt: remove unnecessary fscrypto.h This patch removes fscrypto.h which was missed to remove when porting the latest fs/crypto stuffs. Change-Id: Ib17cba17d48535b0d754ae52537053098248a036 Fixes: 13f002354db13a02 ("f2fs: catch up to v4.14-rc1") Signed-off-by: Jaegeuk Kim --- include/linux/fscrypto.h | 411 --------------------------------------- 1 file changed, 411 deletions(-) delete mode 100644 include/linux/fscrypto.h diff --git a/include/linux/fscrypto.h b/include/linux/fscrypto.h deleted file mode 100644 index e6e53a36104b..000000000000 --- a/include/linux/fscrypto.h +++ /dev/null @@ -1,411 +0,0 @@ -/* - * General per-file encryption definition - * - * Copyright (C) 2015, Google, Inc. - * - * Written by Michael Halcrow, 2015. - * Modified by Jaegeuk Kim, 2015. - */ - -#ifndef _LINUX_FSCRYPTO_H -#define _LINUX_FSCRYPTO_H - -#include -#include -#include -#include -#include -#include -#include - -#define FS_KEY_DERIVATION_NONCE_SIZE 16 -#define FS_ENCRYPTION_CONTEXT_FORMAT_V1 1 - -#define FS_POLICY_FLAGS_PAD_4 0x00 -#define FS_POLICY_FLAGS_PAD_8 0x01 -#define FS_POLICY_FLAGS_PAD_16 0x02 -#define FS_POLICY_FLAGS_PAD_32 0x03 -#define FS_POLICY_FLAGS_PAD_MASK 0x03 -#define FS_POLICY_FLAGS_VALID 0x03 - -/* Encryption algorithms */ -#define FS_ENCRYPTION_MODE_INVALID 0 -#define FS_ENCRYPTION_MODE_AES_256_XTS 1 -#define FS_ENCRYPTION_MODE_AES_256_GCM 2 -#define FS_ENCRYPTION_MODE_AES_256_CBC 3 -#define FS_ENCRYPTION_MODE_AES_256_CTS 4 - -/** - * Encryption context for inode - * - * Protector format: - * 1 byte: Protector format (1 = this version) - * 1 byte: File contents encryption mode - * 1 byte: File names encryption mode - * 1 byte: Flags - * 8 bytes: Master Key descriptor - * 16 bytes: Encryption Key derivation nonce - */ -struct fscrypt_context { - u8 format; - u8 contents_encryption_mode; - u8 filenames_encryption_mode; - u8 flags; - u8 master_key_descriptor[FS_KEY_DESCRIPTOR_SIZE]; - u8 nonce[FS_KEY_DERIVATION_NONCE_SIZE]; -} __packed; - -/* Encryption parameters */ -#define FS_XTS_TWEAK_SIZE 16 -#define FS_AES_128_ECB_KEY_SIZE 16 -#define FS_AES_256_GCM_KEY_SIZE 32 -#define FS_AES_256_CBC_KEY_SIZE 32 -#define FS_AES_256_CTS_KEY_SIZE 32 -#define FS_AES_256_XTS_KEY_SIZE 64 -#define FS_MAX_KEY_SIZE 64 - -#define FS_KEY_DESC_PREFIX "fscrypt:" -#define FS_KEY_DESC_PREFIX_SIZE 8 - -/* This is passed in from userspace into the kernel keyring */ -struct fscrypt_key { - u32 mode; - u8 raw[FS_MAX_KEY_SIZE]; - u32 size; -} __packed; - -struct fscrypt_info { - u8 ci_data_mode; - u8 ci_filename_mode; - u8 ci_flags; - struct crypto_skcipher *ci_ctfm; - struct key *ci_keyring_key; - u8 ci_master_key[FS_KEY_DESCRIPTOR_SIZE]; -}; - -#define FS_CTX_REQUIRES_FREE_ENCRYPT_FL 0x00000001 -#define FS_WRITE_PATH_FL 0x00000002 - -struct fscrypt_ctx { - union { - struct { - struct page *bounce_page; /* Ciphertext page */ - struct page *control_page; /* Original page */ - } w; - struct { - struct bio *bio; - struct work_struct work; - } r; - struct list_head free_list; /* Free list */ - }; - u8 flags; /* Flags */ - u8 mode; /* Encryption mode for tfm */ -}; - -struct fscrypt_completion_result { - struct completion completion; - int res; -}; - -#define DECLARE_FS_COMPLETION_RESULT(ecr) \ - struct fscrypt_completion_result ecr = { \ - COMPLETION_INITIALIZER((ecr).completion), 0 } - -#define FS_FNAME_NUM_SCATTER_ENTRIES 4 -#define FS_CRYPTO_BLOCK_SIZE 16 -#define FS_FNAME_CRYPTO_DIGEST_SIZE 32 - -/** - * For encrypted symlinks, the ciphertext length is stored at the beginning - * of the string in little-endian format. - */ -struct fscrypt_symlink_data { - __le16 len; - char encrypted_path[1]; -} __packed; - -/** - * This function is used to calculate the disk space required to - * store a filename of length l in encrypted symlink format. - */ -static inline u32 fscrypt_symlink_data_len(u32 l) -{ - if (l < FS_CRYPTO_BLOCK_SIZE) - l = FS_CRYPTO_BLOCK_SIZE; - return (l + sizeof(struct fscrypt_symlink_data) - 1); -} - -struct fscrypt_str { - unsigned char *name; - u32 len; -}; - -struct fscrypt_name { - const struct qstr *usr_fname; - struct fscrypt_str disk_name; - u32 hash; - u32 minor_hash; - struct fscrypt_str crypto_buf; -}; - -#define FSTR_INIT(n, l) { .name = n, .len = l } -#define FSTR_TO_QSTR(f) QSTR_INIT((f)->name, (f)->len) -#define fname_name(p) ((p)->disk_name.name) -#define fname_len(p) ((p)->disk_name.len) - -/* - * crypto opertions for filesystems - */ -struct fscrypt_operations { - int (*get_context)(struct inode *, void *, size_t); - int (*key_prefix)(struct inode *, u8 **); - int (*prepare_context)(struct inode *); - int (*set_context)(struct inode *, const void *, size_t, void *); - int (*dummy_context)(struct inode *); - bool (*is_encrypted)(struct inode *); - bool (*empty_dir)(struct inode *); - unsigned (*max_namelen)(struct inode *); -}; - -static inline bool fscrypt_dummy_context_enabled(struct inode *inode) -{ - if (inode->i_sb->s_cop->dummy_context && - inode->i_sb->s_cop->dummy_context(inode)) - return true; - return false; -} - -static inline bool fscrypt_valid_contents_enc_mode(u32 mode) -{ - return (mode == FS_ENCRYPTION_MODE_AES_256_XTS); -} - -static inline bool fscrypt_valid_filenames_enc_mode(u32 mode) -{ - return (mode == FS_ENCRYPTION_MODE_AES_256_CTS); -} - -static inline bool fscrypt_is_dot_dotdot(const struct qstr *str) -{ - if (str->len == 1 && str->name[0] == '.') - return true; - - if (str->len == 2 && str->name[0] == '.' && str->name[1] == '.') - return true; - - return false; -} - -static inline struct page *fscrypt_control_page(struct page *page) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return ((struct fscrypt_ctx *)page_private(page))->w.control_page; -#else - WARN_ON_ONCE(1); - return ERR_PTR(-EINVAL); -#endif -} - -static inline int fscrypt_has_encryption_key(struct inode *inode) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - return (inode->i_crypt_info != NULL); -#else - return 0; -#endif -} - -static inline void fscrypt_set_encrypted_dentry(struct dentry *dentry) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - spin_lock(&dentry->d_lock); - dentry->d_flags |= DCACHE_ENCRYPTED_WITH_KEY; - spin_unlock(&dentry->d_lock); -#endif -} - -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) -extern const struct dentry_operations fscrypt_d_ops; -#endif - -static inline void fscrypt_set_d_op(struct dentry *dentry) -{ -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) - d_set_d_op(dentry, &fscrypt_d_ops); -#endif -} - -#if IS_ENABLED(CONFIG_FS_ENCRYPTION) -/* crypto.c */ -extern struct kmem_cache *fscrypt_info_cachep; -int fscrypt_initialize(void); - -extern struct fscrypt_ctx *fscrypt_get_ctx(struct inode *, gfp_t); -extern void fscrypt_release_ctx(struct fscrypt_ctx *); -extern struct page *fscrypt_encrypt_page(struct inode *, struct page *, gfp_t); -extern int fscrypt_decrypt_page(struct page *); -extern void fscrypt_decrypt_bio_pages(struct fscrypt_ctx *, struct bio *); -extern void fscrypt_pullback_bio_page(struct page **, bool); -extern void fscrypt_restore_control_page(struct page *); -extern int fscrypt_zeroout_range(struct inode *, pgoff_t, sector_t, - unsigned int); -/* policy.c */ -extern int fscrypt_ioctl_set_policy(struct file *, const void __user *); -extern int fscrypt_ioctl_get_policy(struct file *, void __user *); -extern int fscrypt_has_permitted_context(struct inode *, struct inode *); -extern int fscrypt_inherit_context(struct inode *, struct inode *, - void *, bool); -/* keyinfo.c */ -extern int get_crypt_info(struct inode *); -extern int fscrypt_get_encryption_info(struct inode *); -extern void fscrypt_put_encryption_info(struct inode *, struct fscrypt_info *); - -/* fname.c */ -extern int fscrypt_setup_filename(struct inode *, const struct qstr *, - int lookup, struct fscrypt_name *); -extern void fscrypt_free_filename(struct fscrypt_name *); -extern u32 fscrypt_fname_encrypted_size(struct inode *, u32); -extern int fscrypt_fname_alloc_buffer(struct inode *, u32, - struct fscrypt_str *); -extern void fscrypt_fname_free_buffer(struct fscrypt_str *); -extern int fscrypt_fname_disk_to_usr(struct inode *, u32, u32, - const struct fscrypt_str *, struct fscrypt_str *); -extern int fscrypt_fname_usr_to_disk(struct inode *, const struct qstr *, - struct fscrypt_str *); -#endif - -/* crypto.c */ -static inline struct fscrypt_ctx *fscrypt_notsupp_get_ctx(struct inode *i, - gfp_t f) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static inline void fscrypt_notsupp_release_ctx(struct fscrypt_ctx *c) -{ - return; -} - -static inline struct page *fscrypt_notsupp_encrypt_page(struct inode *i, - struct page *p, gfp_t f) -{ - return ERR_PTR(-EOPNOTSUPP); -} - -static inline int fscrypt_notsupp_decrypt_page(struct page *p) -{ - return -EOPNOTSUPP; -} - -static inline void fscrypt_notsupp_decrypt_bio_pages(struct fscrypt_ctx *c, - struct bio *b) -{ - return; -} - -static inline void fscrypt_notsupp_pullback_bio_page(struct page **p, bool b) -{ - return; -} - -static inline void fscrypt_notsupp_restore_control_page(struct page *p) -{ - return; -} - -static inline int fscrypt_notsupp_zeroout_range(struct inode *i, pgoff_t p, - sector_t s, unsigned int f) -{ - return -EOPNOTSUPP; -} - -/* policy.c */ -static inline int fscrypt_notsupp_ioctl_set_policy(struct file *f, - const void __user *arg) -{ - return -EOPNOTSUPP; -} - -static inline int fscrypt_notsupp_ioctl_get_policy(struct file *f, - void __user *arg) -{ - return -EOPNOTSUPP; -} - -static inline int fscrypt_notsupp_has_permitted_context(struct inode *p, - struct inode *i) -{ - return 0; -} - -static inline int fscrypt_notsupp_inherit_context(struct inode *p, - struct inode *i, void *v, bool b) -{ - return -EOPNOTSUPP; -} - -/* keyinfo.c */ -static inline int fscrypt_notsupp_get_encryption_info(struct inode *i) -{ - return -EOPNOTSUPP; -} - -static inline void fscrypt_notsupp_put_encryption_info(struct inode *i, - struct fscrypt_info *f) -{ - return; -} - - /* fname.c */ -static inline int fscrypt_notsupp_setup_filename(struct inode *dir, - const struct qstr *iname, - int lookup, struct fscrypt_name *fname) -{ - if (dir->i_sb->s_cop->is_encrypted(dir)) - return -EOPNOTSUPP; - - memset(fname, 0, sizeof(struct fscrypt_name)); - fname->usr_fname = iname; - fname->disk_name.name = (unsigned char *)iname->name; - fname->disk_name.len = iname->len; - return 0; -} - -static inline void fscrypt_notsupp_free_filename(struct fscrypt_name *fname) -{ - return; -} - -static inline u32 fscrypt_notsupp_fname_encrypted_size(struct inode *i, u32 s) -{ - /* never happens */ - WARN_ON(1); - return 0; -} - -static inline int fscrypt_notsupp_fname_alloc_buffer(struct inode *inode, - u32 ilen, struct fscrypt_str *crypto_str) -{ - return -EOPNOTSUPP; -} - -static inline void fscrypt_notsupp_fname_free_buffer(struct fscrypt_str *c) -{ - return; -} - -static inline int fscrypt_notsupp_fname_disk_to_usr(struct inode *inode, - u32 hash, u32 minor_hash, - const struct fscrypt_str *iname, - struct fscrypt_str *oname) -{ - return -EOPNOTSUPP; -} - -static inline int fscrypt_notsupp_fname_usr_to_disk(struct inode *inode, - const struct qstr *iname, - struct fscrypt_str *oname) -{ - return -EOPNOTSUPP; -} -#endif /* _LINUX_FSCRYPTO_H */ From 00034f46044cd96d882b48d3e6e20ae38a1c80fd Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Mon, 2 Oct 2017 02:50:16 +0800 Subject: [PATCH 514/546] UPSTREAM: f2fs: fix potential panic during fstrim As Ju Hyung Park reported: "When 'fstrim' is called for manual trim, a BUG() can be triggered randomly with this patch. I'm seeing this issue on both x86 Desktop and arm64 Android phone. On x86 Desktop, this was caused during Ubuntu boot-up. I have a cronjob installed which calls 'fstrim -v /' during boot. On arm64 Android, this was caused during GC looping with 1ms gc_min_sleep_time & gc_max_sleep_time." Root cause of this issue is that f2fs_wait_discard_bios can only be used by f2fs_put_super, because during put_super there must be no other referrers, so it can ignore discard entry's reference count when removing the entry, otherwise in other caller we will hit bug_on in __remove_discard_cmd as there may be other issuer added reference count in discard entry. Thread A Thread B - issue_discard_thread - f2fs_ioc_fitrim - f2fs_trim_fs - f2fs_wait_discard_bios - __issue_discard_cmd - __submit_discard_cmd - __wait_discard_cmd - dc->ref++ - __wait_one_discard_bio - __wait_discard_cmd - __remove_discard_cmd - f2fs_bug_on(sbi, dc->ref) Change-Id: I8fb5c8215e6222ae853e7781218d5084e1f11166 Fixes: 969d1b180d987c2be02de890d0fff0f66a0e80de Reported-by: Ju Hyung Park Signed-off-by: Chao Yu Signed-off-by: Jaegeuk Kim (cherry picked from commit 638164a2718f337ea224b747cf5977ef143166a4) --- fs/f2fs/f2fs.h | 2 +- fs/f2fs/segment.c | 6 +++--- fs/f2fs/super.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index ff694127243a..dd840f60e172 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -2606,7 +2606,7 @@ void invalidate_blocks(struct f2fs_sb_info *sbi, block_t addr); bool is_checkpointed_data(struct f2fs_sb_info *sbi, block_t blkaddr); void refresh_sit_entry(struct f2fs_sb_info *sbi, block_t old, block_t new); void stop_discard_thread(struct f2fs_sb_info *sbi); -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi); +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount); void clear_prefree_segments(struct f2fs_sb_info *sbi, struct cp_control *cpc); void release_discard_addrs(struct f2fs_sb_info *sbi); int npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 059a219b7740..f5c494389483 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -1290,11 +1290,11 @@ void stop_discard_thread(struct f2fs_sb_info *sbi) } /* This comes from f2fs_put_super and f2fs_trim_fs */ -void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi) +void f2fs_wait_discard_bios(struct f2fs_sb_info *sbi, bool umount) { __issue_discard_cmd(sbi, false); __drop_discard_cmd(sbi); - __wait_discard_cmd(sbi, false); + __wait_discard_cmd(sbi, !umount); } static void mark_discard_range_all(struct f2fs_sb_info *sbi) @@ -2324,7 +2324,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range) } /* It's time to issue all the filed discards */ mark_discard_range_all(sbi); - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, false); out: range->len = F2FS_BLK_TO_BYTES(cpc.trimmed); return err; diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 315e59ad1483..482bb0333806 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -801,7 +801,7 @@ static void f2fs_put_super(struct super_block *sb) } /* be sure to wait for any on-going discard commands */ - f2fs_wait_discard_bios(sbi); + f2fs_wait_discard_bios(sbi, true); if (f2fs_discard_en(sbi) && !sbi->discard_blks) { struct cp_control cpc = { From c7fa8b72faf531d137ef2a7e103911f215a5fe07 Mon Sep 17 00:00:00 2001 From: David Miller Date: Fri, 2 Jun 2017 11:28:54 -0400 Subject: [PATCH 515/546] crypto: Work around deallocated stack frame reference gcc bug on sparc. commit d41519a69b35b10af7fda867fb9100df24fdf403 upstream. On sparc, if we have an alloca() like situation, as is the case with SHASH_DESC_ON_STACK(), we can end up referencing deallocated stack memory. The result can be that the value is clobbered if a trap or interrupt arrives at just the right instruction. It only occurs if the function ends returning a value from that alloca() area and that value can be placed into the return value register using a single instruction. For example, in lib/libcrc32c.c:crc32c() we end up with a return sequence like: return %i7+8 lduw [%o5+16], %o0 ! MEM[(u32 *)__shash_desc.1_10 + 16B], %o5 holds the base of the on-stack area allocated for the shash descriptor. But the return released the stack frame and the register window. So if an intererupt arrives between 'return' and 'lduw', then the value read at %o5+16 can be corrupted. Add a data compiler barrier to work around this problem. This is exactly what the gcc fix will end up doing as well, and it absolutely should not change the code generated for other cpus (unless gcc on them has the same bug :-) With crucial insight from Eric Sandeen. Reported-by: Anatoly Pugachev Signed-off-by: David S. Miller Signed-off-by: Herbert Xu Signed-off-by: Greg Kroah-Hartman --- fs/btrfs/hash.c | 5 ++++- fs/f2fs/f2fs.h | 5 ++++- lib/libcrc32c.c | 6 ++++-- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/hash.c b/fs/btrfs/hash.c index aae520b2aee5..6fc94fcba072 100644 --- a/fs/btrfs/hash.c +++ b/fs/btrfs/hash.c @@ -33,6 +33,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = tfm; @@ -42,5 +43,7 @@ u32 btrfs_crc32c(u32 crc, const void *address, unsigned int length) err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index dd840f60e172..c1a0aef8efc6 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -1268,6 +1268,7 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, { SHASH_DESC_ON_STACK(shash, sbi->s_chksum_driver); u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 retval; int err; shash->tfm = sbi->s_chksum_driver; @@ -1277,7 +1278,9 @@ static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + retval = *ctx; + barrier_data(ctx); + return retval; } static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index acf9da449f81..2c573f08adb7 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -42,7 +42,7 @@ static struct crypto_shash *tfm; u32 crc32c(u32 crc, const void *address, unsigned int length) { SHASH_DESC_ON_STACK(shash, tfm); - u32 *ctx = (u32 *)shash_desc_ctx(shash); + u32 ret, *ctx = (u32 *)shash_desc_ctx(shash); int err; shash->tfm = tfm; @@ -52,7 +52,9 @@ u32 crc32c(u32 crc, const void *address, unsigned int length) err = crypto_shash_update(shash, address, length); BUG_ON(err); - return *ctx; + ret = *ctx; + barrier_data(ctx); + return ret; } EXPORT_SYMBOL(crc32c); From a717135c23fa60cfaf0c3768e8e72383237ce88d Mon Sep 17 00:00:00 2001 From: Jaegeuk Kim Date: Fri, 13 Oct 2017 10:27:45 -0700 Subject: [PATCH 516/546] FROMLIST: f2fs: expose some sectors to user in inline data or dentry case (from https://patchwork.kernel.org/patch/10005409/) If there's some data written through inline data or dentry, we need to shouw st_blocks. This fixes reporting zero blocks even though there is small written data. Bug: 67651285 Bug: 67600404 Change-Id: I9ad5cb137eb627b9fd22740d2ab98e0221433c95 Cc: stable@vger.kernel.org Signed-off-by: Jaegeuk Kim --- fs/f2fs/file.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 531379f513fa..a9e1655a6bf8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -665,6 +665,11 @@ int f2fs_getattr(struct vfsmount *mnt, { struct inode *inode = d_inode(dentry); generic_fillattr(inode, stat); + + /* we need to show initial sectors used for inline_data/dentries */ + if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) + stat->blocks += (stat->size + 511) >> 9; + return 0; } From e2e73a057dd0e0e0dc6d710a693c83672d1a0688 Mon Sep 17 00:00:00 2001 From: Jens Wiklander Date: Mon, 9 Oct 2017 11:11:49 +0200 Subject: [PATCH 517/546] optee: fix invalid of_node_put() in optee_driver_init() The first node supplied to of_find_matching_node() has its reference counter decreased as part of call to that function. In optee_driver_init() after calling of_find_matching_node() it's invalid to call of_node_put() on the supplied node again. So remove the invalid call to of_node_put(). Signed-off-by: Jens Wiklander --- drivers/tee/optee/core.c | 1 - 1 file changed, 1 deletion(-) diff --git a/drivers/tee/optee/core.c b/drivers/tee/optee/core.c index 58169e519422..18c8c0a50d37 100644 --- a/drivers/tee/optee/core.c +++ b/drivers/tee/optee/core.c @@ -589,7 +589,6 @@ static int __init optee_driver_init(void) return -ENODEV; np = of_find_matching_node(fw_np, optee_match); - of_node_put(fw_np); if (!np) return -ENODEV; From 6721969c7b8a35f85a41acaabdad190ed32cf704 Mon Sep 17 00:00:00 2001 From: Arend Van Spriel Date: Tue, 12 Sep 2017 10:47:53 +0200 Subject: [PATCH 518/546] brcmfmac: add length check in brcmf_cfg80211_escan_handler() commit 17df6453d4be17910456e99c5a85025aa1b7a246 upstream. Upon handling the firmware notification for scans the length was checked properly and may result in corrupting kernel heap memory due to buffer overruns. This fix addresses CVE-2017-0786. Cc: Kevin Cernekee Reviewed-by: Hante Meuleman Reviewed-by: Pieter-Paul Giesberts Reviewed-by: Franky Lin Signed-off-by: Arend van Spriel Signed-off-by: Kalle Valo Signed-off-by: Greg Kroah-Hartman --- .../net/wireless/brcm80211/brcmfmac/cfg80211.c | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c index f18491cf793c..5fecae0ba52e 100644 --- a/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c +++ b/drivers/net/wireless/brcm80211/brcmfmac/cfg80211.c @@ -2903,6 +2903,7 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, struct brcmf_cfg80211_info *cfg = ifp->drvr->config; s32 status; struct brcmf_escan_result_le *escan_result_le; + u32 escan_buflen; struct brcmf_bss_info_le *bss_info_le; struct brcmf_bss_info_le *bss = NULL; u32 bi_length; @@ -2919,11 +2920,23 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, if (status == BRCMF_E_STATUS_PARTIAL) { brcmf_dbg(SCAN, "ESCAN Partial result\n"); + if (e->datalen < sizeof(*escan_result_le)) { + brcmf_err("invalid event data length\n"); + goto exit; + } escan_result_le = (struct brcmf_escan_result_le *) data; if (!escan_result_le) { brcmf_err("Invalid escan result (NULL pointer)\n"); goto exit; } + escan_buflen = le32_to_cpu(escan_result_le->buflen); + if (escan_buflen > WL_ESCAN_BUF_SIZE || + escan_buflen > e->datalen || + escan_buflen < sizeof(*escan_result_le)) { + brcmf_err("Invalid escan buffer length: %d\n", + escan_buflen); + goto exit; + } if (le16_to_cpu(escan_result_le->bss_count) != 1) { brcmf_err("Invalid bss_count %d: ignoring\n", escan_result_le->bss_count); @@ -2940,9 +2953,8 @@ brcmf_cfg80211_escan_handler(struct brcmf_if *ifp, } bi_length = le32_to_cpu(bss_info_le->length); - if (bi_length != (le32_to_cpu(escan_result_le->buflen) - - WL_ESCAN_RESULTS_FIXED_SIZE)) { - brcmf_err("Invalid bss_info length %d: ignoring\n", + if (bi_length != escan_buflen - WL_ESCAN_RESULTS_FIXED_SIZE) { + brcmf_err("Ignoring invalid bss_info length: %d\n", bi_length); goto exit; } From bd36826958de7cc70a9e7d83c7a6f4916fb863a2 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Thu, 24 Aug 2017 13:22:06 -0400 Subject: [PATCH 519/546] ext4: in ext4_seek_{hole,data}, return -ENXIO for negative offsets commit 1bd8d6cd3e413d64e543ec3e69ff43e75a1cf1ea upstream. In the ext4 implementations of SEEK_HOLE and SEEK_DATA, make sure we return -ENXIO for negative offsets instead of banging around inside the extent code and returning -EFSCORRUPTED. Reported-by: Mateusz S Signed-off-by: Darrick J. Wong Signed-off-by: Theodore Ts'o Signed-off-by: Greg Kroah-Hartman --- fs/ext4/file.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 45ef9975caec..a8b1749d79a8 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -559,7 +559,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) mutex_lock(&inode->i_mutex); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { mutex_unlock(&inode->i_mutex); return -ENXIO; } @@ -632,7 +632,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) mutex_lock(&inode->i_mutex); isize = i_size_read(inode); - if (offset >= isize) { + if (offset < 0 || offset >= isize) { mutex_unlock(&inode->i_mutex); return -ENXIO; } From f2bb4bcc041194d655e41a4fb24789f3966755c3 Mon Sep 17 00:00:00 2001 From: Pavel Shilovsky Date: Sat, 8 Jul 2017 14:32:00 -0700 Subject: [PATCH 520/546] CIFS: Reconnect expired SMB sessions commit 511c54a2f69195b28afb9dd119f03787b1625bb4 upstream. According to the MS-SMB2 spec (3.2.5.1.6) once the client receives STATUS_NETWORK_SESSION_EXPIRED error code from a server it should reconnect the current SMB session. Currently the client doesn't do that. This can result in subsequent client requests failing by the server. The patch adds an additional logic to the demultiplex thread to identify expired sessions and reconnect them. Signed-off-by: Pavel Shilovsky Signed-off-by: Steve French Signed-off-by: Greg Kroah-Hartman --- fs/cifs/cifsglob.h | 2 ++ fs/cifs/cifssmb.c | 7 +++++++ fs/cifs/connect.c | 7 +++++++ fs/cifs/smb2ops.c | 16 ++++++++++++++++ 4 files changed, 32 insertions(+) diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index e2f6a79e9b01..8225de3c9743 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -351,6 +351,8 @@ struct smb_version_operations { unsigned int (*calc_smb_size)(void *); /* check for STATUS_PENDING and process it in a positive case */ bool (*is_status_pending)(char *, struct TCP_Server_Info *, int); + /* check for STATUS_NETWORK_SESSION_EXPIRED */ + bool (*is_session_expired)(char *); /* send oplock break response */ int (*oplock_response)(struct cifs_tcon *, struct cifs_fid *, struct cifsInodeInfo *); diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index b60150e5b5ce..0c92af11f4f4 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -1460,6 +1460,13 @@ cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid) return length; server->total_read += length; + if (server->ops->is_session_expired && + server->ops->is_session_expired(buf)) { + cifs_reconnect(server); + wake_up(&server->response_q); + return -1; + } + if (server->ops->is_status_pending && server->ops->is_status_pending(buf, server, 0)) { discard_remaining_data(server); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index b377aa8f266f..0a2bf9462637 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -850,6 +850,13 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); + if (server->ops->is_session_expired && + server->ops->is_session_expired(buf)) { + cifs_reconnect(server); + wake_up(&server->response_q); + return -1; + } + if (server->ops->is_status_pending && server->ops->is_status_pending(buf, server, length)) return -1; diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 1d125d3d0d89..e6b1795fbf2a 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -963,6 +963,18 @@ smb2_is_status_pending(char *buf, struct TCP_Server_Info *server, int length) return true; } +static bool +smb2_is_session_expired(char *buf) +{ + struct smb2_hdr *hdr = (struct smb2_hdr *)buf; + + if (hdr->Status != STATUS_NETWORK_SESSION_EXPIRED) + return false; + + cifs_dbg(FYI, "Session expired\n"); + return true; +} + static int smb2_oplock_response(struct cifs_tcon *tcon, struct cifs_fid *fid, struct cifsInodeInfo *cinode) @@ -1552,6 +1564,7 @@ struct smb_version_operations smb20_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, @@ -1633,6 +1646,7 @@ struct smb_version_operations smb21_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, @@ -1715,6 +1729,7 @@ struct smb_version_operations smb30_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, @@ -1803,6 +1818,7 @@ struct smb_version_operations smb311_operations = { .close_dir = smb2_close_dir, .calc_smb_size = smb2_calc_size, .is_status_pending = smb2_is_status_pending, + .is_session_expired = smb2_is_session_expired, .oplock_response = smb2_oplock_response, .queryfs = smb2_queryfs, .mand_lock = smb2_mand_lock, From 6a6c61d8467d2dd7059b7d52773c18f8122e4f68 Mon Sep 17 00:00:00 2001 From: Peng Xu Date: Tue, 3 Oct 2017 23:21:51 +0300 Subject: [PATCH 521/546] nl80211: Define policy for packet pattern attributes commit ad670233c9e1d5feb365d870e30083ef1b889177 upstream. Define a policy for packet pattern attributes in order to fix a potential read over the end of the buffer during nla_get_u32() of the NL80211_PKTPAT_OFFSET attribute. Note that the data there can always be read due to SKB allocation (with alignment and struct skb_shared_info at the end), but the data might be uninitialized. This could be used to leak some data from uninitialized vmalloc() memory, but most drivers don't allow an offset (so you'd just get -EINVAL if the data is non-zero) or just allow it with a fixed value - 100 or 128 bytes, so anything above that would get -EINVAL. With brcmfmac the limit is 1500 so (at least) one byte could be obtained. Cc: stable@kernel.org Signed-off-by: Peng Xu Signed-off-by: Jouni Malinen [rewrite description based on SKB allocation knowledge] Signed-off-by: Johannes Berg Signed-off-by: Greg Kroah-Hartman --- net/wireless/nl80211.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c index 8ece212aa3d2..7950506395a8 100644 --- a/net/wireless/nl80211.c +++ b/net/wireless/nl80211.c @@ -485,6 +485,14 @@ nl80211_plan_policy[NL80211_SCHED_SCAN_PLAN_MAX + 1] = { [NL80211_SCHED_SCAN_PLAN_ITERATIONS] = { .type = NLA_U32 }, }; +/* policy for packet pattern attributes */ +static const struct nla_policy +nl80211_packet_pattern_policy[MAX_NL80211_PKTPAT + 1] = { + [NL80211_PKTPAT_MASK] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_PATTERN] = { .type = NLA_BINARY, }, + [NL80211_PKTPAT_OFFSET] = { .type = NLA_U32 }, +}; + static int nl80211_prepare_wdev_dump(struct sk_buff *skb, struct netlink_callback *cb, struct cfg80211_registered_device **rdev, @@ -9410,7 +9418,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info) u8 *mask_pat; nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat), - nla_len(pat), NULL); + nla_len(pat), nl80211_packet_pattern_policy); err = -EINVAL; if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) @@ -9660,7 +9668,7 @@ static int nl80211_parse_coalesce_rule(struct cfg80211_registered_device *rdev, u8 *mask_pat; nla_parse(pat_tb, MAX_NL80211_PKTPAT, nla_data(pat), - nla_len(pat), NULL); + nla_len(pat), nl80211_packet_pattern_policy); if (!pat_tb[NL80211_PKTPAT_MASK] || !pat_tb[NL80211_PKTPAT_PATTERN]) return -EINVAL; From 45bd4e40804009d0f312ce8640d1f6a1aa306aa4 Mon Sep 17 00:00:00 2001 From: Luca Coelho Date: Fri, 1 Sep 2017 17:59:15 +0300 Subject: [PATCH 522/546] iwlwifi: mvm: use IWL_HCMD_NOCOPY for MCAST_FILTER_CMD commit 97bce57bd7f96e1218751996f549a6e61f18cc8c upstream. The MCAST_FILTER_CMD can get quite large when we have many mcast addresses to set (we support up to 255). So the command should be send as NOCOPY to prevent a warning caused by too-long commands: WARNING: CPU: 0 PID: 9700 at /root/iwlwifi/stack-dev/drivers/net/wireless/intel/iwlwifi/pcie/tx.c:1550 iwl_pcie_enqueue_hcmd+0x8c7/0xb40 [iwlwifi] Command MCAST_FILTER_CMD (0x1d0) is too large (328 bytes) This fixes: https://bugzilla.kernel.org/show_bug.cgi?id=196743 Signed-off-by: Luca Coelho Signed-off-by: Greg Kroah-Hartman --- drivers/net/wireless/iwlwifi/mvm/mac80211.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/iwlwifi/mvm/mac80211.c b/drivers/net/wireless/iwlwifi/mvm/mac80211.c index 1a8ea775de08..984cd2f05c4a 100644 --- a/drivers/net/wireless/iwlwifi/mvm/mac80211.c +++ b/drivers/net/wireless/iwlwifi/mvm/mac80211.c @@ -1906,6 +1906,11 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, struct iwl_mvm_mc_iter_data *data = _data; struct iwl_mvm *mvm = data->mvm; struct iwl_mcast_filter_cmd *cmd = mvm->mcast_filter_cmd; + struct iwl_host_cmd hcmd = { + .id = MCAST_FILTER_CMD, + .flags = CMD_ASYNC, + .dataflags[0] = IWL_HCMD_DFL_NOCOPY, + }; int ret, len; /* if we don't have free ports, mcast frames will be dropped */ @@ -1920,7 +1925,10 @@ static void iwl_mvm_mc_iface_iterator(void *_data, u8 *mac, memcpy(cmd->bssid, vif->bss_conf.bssid, ETH_ALEN); len = roundup(sizeof(*cmd) + cmd->count * ETH_ALEN, 4); - ret = iwl_mvm_send_cmd_pdu(mvm, MCAST_FILTER_CMD, CMD_ASYNC, len, cmd); + hcmd.len[0] = len; + hcmd.data[0] = cmd; + + ret = iwl_mvm_send_cmd(mvm, &hcmd); if (ret) IWL_ERR(mvm, "mcast filter cmd error. ret=%d\n", ret); } From 5fd45516595ac995b859ab817031bb004542d446 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 22 Sep 2017 14:10:22 -0700 Subject: [PATCH 523/546] rcu: Allow for page faults in NMI handlers commit 28585a832602747cbfa88ad8934013177a3aae38 upstream. A number of architecture invoke rcu_irq_enter() on exception entry in order to allow RCU read-side critical sections in the exception handler when the exception is from an idle or nohz_full CPU. This works, at least unless the exception happens in an NMI handler. In that case, rcu_nmi_enter() would already have exited the extended quiescent state, which would mean that rcu_irq_enter() would (incorrectly) cause RCU to think that it is again in an extended quiescent state. This will in turn result in lockdep splats in response to later RCU read-side critical sections. This commit therefore causes rcu_irq_enter() and rcu_irq_exit() to take no action if there is an rcu_nmi_enter() in effect, thus avoiding the unscheduled return to RCU quiescent state. This in turn should make the kernel safe for on-demand RCU voyeurism. Link: http://lkml.kernel.org/r/20170922211022.GA18084@linux.vnet.ibm.com Cc: stable@vger.kernel.org Fixes: 0be964be0 ("module: Sanitize RCU usage and locking") Reported-by: Steven Rostedt Signed-off-by: Paul E. McKenney Signed-off-by: Steven Rostedt (VMware) Signed-off-by: Greg Kroah-Hartman --- kernel/rcu/tree.c | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f07343b54fe5..8a62cbfe1f2f 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -759,6 +759,12 @@ void rcu_irq_exit(void) local_irq_save(flags); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_exit() invoked with irqs enabled!!!"); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting--; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && @@ -887,6 +893,12 @@ void rcu_irq_enter(void) local_irq_save(flags); rdtp = this_cpu_ptr(&rcu_dynticks); + + /* Page faults can happen in NMI handlers, so check... */ + if (READ_ONCE(rdtp->dynticks_nmi_nesting)) + return; + + RCU_LOCKDEP_WARN(!irqs_disabled(), "rcu_irq_enter() invoked with irqs enabled!!!"); oldval = rdtp->dynticks_nesting; rdtp->dynticks_nesting++; WARN_ON_ONCE(IS_ENABLED(CONFIG_RCU_EQS_DEBUG) && From 2fff3c5c2be733b16674efe9522d824dfc39038f Mon Sep 17 00:00:00 2001 From: Alan Stern Date: Fri, 6 Oct 2017 10:27:44 -0400 Subject: [PATCH 524/546] USB: dummy-hcd: Fix deadlock caused by disconnect detection commit ab219221a5064abfff9f78c323c4a257b16cdb81 upstream. The dummy-hcd driver calls the gadget driver's disconnect callback under the wrong conditions. It should invoke the callback when Vbus power is turned off, but instead it does so when the D+ pullup is turned off. This can cause a deadlock in the composite core when a gadget driver is unregistered: [ 88.361471] ============================================ [ 88.362014] WARNING: possible recursive locking detected [ 88.362580] 4.14.0-rc2+ #9 Not tainted [ 88.363010] -------------------------------------------- [ 88.363561] v4l_id/526 is trying to acquire lock: [ 88.364062] (&(&cdev->lock)->rlock){....}, at: [] composite_disconnect+0x43/0x100 [libcomposite] [ 88.365051] [ 88.365051] but task is already holding lock: [ 88.365826] (&(&cdev->lock)->rlock){....}, at: [] usb_function_deactivate+0x29/0x80 [libcomposite] [ 88.366858] [ 88.366858] other info that might help us debug this: [ 88.368301] Possible unsafe locking scenario: [ 88.368301] [ 88.369304] CPU0 [ 88.369701] ---- [ 88.370101] lock(&(&cdev->lock)->rlock); [ 88.370623] lock(&(&cdev->lock)->rlock); [ 88.371145] [ 88.371145] *** DEADLOCK *** [ 88.371145] [ 88.372211] May be due to missing lock nesting notation [ 88.372211] [ 88.373191] 2 locks held by v4l_id/526: [ 88.373715] #0: (&(&cdev->lock)->rlock){....}, at: [] usb_function_deactivate+0x29/0x80 [libcomposite] [ 88.374814] #1: (&(&dum_hcd->dum->lock)->rlock){....}, at: [] dummy_pullup+0x7d/0xf0 [dummy_hcd] [ 88.376289] [ 88.376289] stack backtrace: [ 88.377726] CPU: 0 PID: 526 Comm: v4l_id Not tainted 4.14.0-rc2+ #9 [ 88.378557] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 [ 88.379504] Call Trace: [ 88.380019] dump_stack+0x86/0xc7 [ 88.380605] __lock_acquire+0x841/0x1120 [ 88.381252] lock_acquire+0xd5/0x1c0 [ 88.381865] ? composite_disconnect+0x43/0x100 [libcomposite] [ 88.382668] _raw_spin_lock_irqsave+0x40/0x54 [ 88.383357] ? composite_disconnect+0x43/0x100 [libcomposite] [ 88.384290] composite_disconnect+0x43/0x100 [libcomposite] [ 88.385490] set_link_state+0x2d4/0x3c0 [dummy_hcd] [ 88.386436] dummy_pullup+0xa7/0xf0 [dummy_hcd] [ 88.387195] usb_gadget_disconnect+0xd8/0x160 [udc_core] [ 88.387990] usb_gadget_deactivate+0xd3/0x160 [udc_core] [ 88.388793] usb_function_deactivate+0x64/0x80 [libcomposite] [ 88.389628] uvc_function_disconnect+0x1e/0x40 [usb_f_uvc] This patch changes the code to test the port-power status bit rather than the port-connect status bit when deciding whether to isue the callback. Signed-off-by: Alan Stern Reported-by: David Tulloh Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/udc/dummy_hcd.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/drivers/usb/gadget/udc/dummy_hcd.c b/drivers/usb/gadget/udc/dummy_hcd.c index db645c38055d..8080a11947b7 100644 --- a/drivers/usb/gadget/udc/dummy_hcd.c +++ b/drivers/usb/gadget/udc/dummy_hcd.c @@ -420,6 +420,7 @@ static void set_link_state_by_speed(struct dummy_hcd *dum_hcd) static void set_link_state(struct dummy_hcd *dum_hcd) { struct dummy *dum = dum_hcd->dum; + unsigned int power_bit; dum_hcd->active = 0; if (dum->pullup) @@ -430,17 +431,19 @@ static void set_link_state(struct dummy_hcd *dum_hcd) return; set_link_state_by_speed(dum_hcd); + power_bit = (dummy_hcd_to_hcd(dum_hcd)->speed == HCD_USB3 ? + USB_SS_PORT_STAT_POWER : USB_PORT_STAT_POWER); if ((dum_hcd->port_status & USB_PORT_STAT_ENABLE) == 0 || dum_hcd->active) dum_hcd->resuming = 0; /* Currently !connected or in reset */ - if ((dum_hcd->port_status & USB_PORT_STAT_CONNECTION) == 0 || + if ((dum_hcd->port_status & power_bit) == 0 || (dum_hcd->port_status & USB_PORT_STAT_RESET) != 0) { - unsigned disconnect = USB_PORT_STAT_CONNECTION & + unsigned int disconnect = power_bit & dum_hcd->old_status & (~dum_hcd->port_status); - unsigned reset = USB_PORT_STAT_RESET & + unsigned int reset = USB_PORT_STAT_RESET & (~dum_hcd->old_status) & dum_hcd->port_status; /* Report reset and disconnect events to the driver */ From 29b202ebf5991e7ed055ed186e79ca6a4ab8e25b Mon Sep 17 00:00:00 2001 From: Paul Burton Date: Fri, 8 Sep 2017 15:12:21 -0700 Subject: [PATCH 525/546] MIPS: math-emu: Remove pr_err() calls from fpu_emu() commit ca8eb05b5f332a9e1ab3e2ece498d49f4d683470 upstream. The FPU emulator includes 2 calls to pr_err() which are triggered by invalid instruction encodings for MIPSr6 cmp.cond.fmt instructions. These cases are not kernel errors, merely invalid instructions which are already handled by delivering a SIGILL which will provide notification that something failed in cases where that makes sense. In cases where that SIGILL is somewhat expected & being handled, for example when crashme happens to generate one of the affected bad encodings, the message is printed with no useful context about what triggered it & spams the kernel log for no good reason. Remove the pr_err() calls to make crashme run silently & treat the bad encodings the same way we do others, with a SIGILL & no further kernel log output. Signed-off-by: Paul Burton Fixes: f8c3c6717a71 ("MIPS: math-emu: Add support for the CMP.condn.fmt R6 instruction") Cc: linux-mips@linux-mips.org Patchwork: https://patchwork.linux-mips.org/patch/17253/ Signed-off-by: Ralf Baechle Signed-off-by: Greg Kroah-Hartman --- arch/mips/math-emu/cp1emu.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/arch/mips/math-emu/cp1emu.c b/arch/mips/math-emu/cp1emu.c index 6da2e4a6ba39..dd058aa8a3b5 100644 --- a/arch/mips/math-emu/cp1emu.c +++ b/arch/mips/math-emu/cp1emu.c @@ -2360,7 +2360,6 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, break; default: /* Reserved R6 ops */ - pr_err("Reserved MIPS R6 CMP.condn.S operation\n"); return SIGILL; } } @@ -2434,7 +2433,6 @@ static int fpu_emu(struct pt_regs *xcp, struct mips_fpu_struct *ctx, break; default: /* Reserved R6 ops */ - pr_err("Reserved MIPS R6 CMP.condn.D operation\n"); return SIGILL; } } From e7485f0f6a7b984c05c65b6e56fde45bb65089a7 Mon Sep 17 00:00:00 2001 From: Peter Ujfalusi Date: Mon, 18 Sep 2017 11:16:26 +0300 Subject: [PATCH 526/546] dmaengine: edma: Align the memcpy acnt array size with the transfer commit 87a2f622cc6446c7d09ac655b7b9b04886f16a4c upstream. Memory to Memory transfers does not have any special alignment needs regarding to acnt array size, but if one of the areas are in memory mapped regions (like PCIe memory), we need to make sure that the acnt array size is aligned with the mem copy parameters. Before "dmaengine: edma: Optimize memcpy operation" change the memcpy was set up in a different way: acnt == number of bytes in a word based on __ffs((src | dest | len), bcnt and ccnt for looping the necessary number of words to comlete the trasnfer. Instead of reverting the commit we can fix it to make sure that the ACNT size is aligned to the traswnfer. Fixes: df6694f80365a (dmaengine: edma: Optimize memcpy operation) Signed-off-by: Peter Ujfalusi Signed-off-by: Vinod Koul Signed-off-by: Greg Kroah-Hartman --- drivers/dma/edma.c | 19 ++++++++++++++++--- 1 file changed, 16 insertions(+), 3 deletions(-) diff --git a/drivers/dma/edma.c b/drivers/dma/edma.c index 16fe773fb846..85674a8d0436 100644 --- a/drivers/dma/edma.c +++ b/drivers/dma/edma.c @@ -1126,11 +1126,24 @@ static struct dma_async_tx_descriptor *edma_prep_dma_memcpy( struct edma_desc *edesc; struct device *dev = chan->device->dev; struct edma_chan *echan = to_edma_chan(chan); - unsigned int width, pset_len; + unsigned int width, pset_len, array_size; if (unlikely(!echan || !len)) return NULL; + /* Align the array size (acnt block) with the transfer properties */ + switch (__ffs((src | dest | len))) { + case 0: + array_size = SZ_32K - 1; + break; + case 1: + array_size = SZ_32K - 2; + break; + default: + array_size = SZ_32K - 4; + break; + } + if (len < SZ_64K) { /* * Transfer size less than 64K can be handled with one paRAM @@ -1152,7 +1165,7 @@ static struct dma_async_tx_descriptor *edma_prep_dma_memcpy( * When the full_length is multibple of 32767 one slot can be * used to complete the transfer. */ - width = SZ_32K - 1; + width = array_size; pset_len = rounddown(len, width); /* One slot is enough for lengths multiple of (SZ_32K -1) */ if (unlikely(pset_len == len)) @@ -1202,7 +1215,7 @@ static struct dma_async_tx_descriptor *edma_prep_dma_memcpy( } dest += pset_len; src += pset_len; - pset_len = width = len % (SZ_32K - 1); + pset_len = width = len % array_size; ret = edma_config_pset(chan, &edesc->pset[1], src, dest, 1, width, pset_len, DMA_MEM_TO_MEM); From 2929cb995378205bceda86d6fd3cbc22e522f97f Mon Sep 17 00:00:00 2001 From: Jaejoong Kim Date: Thu, 28 Sep 2017 19:16:30 +0900 Subject: [PATCH 527/546] HID: usbhid: fix out-of-bounds bug commit f043bfc98c193c284e2cd768fefabe18ac2fed9b upstream. The hid descriptor identifies the length and type of subordinate descriptors for a device. If the received hid descriptor is smaller than the size of the struct hid_descriptor, it is possible to cause out-of-bounds. In addition, if bNumDescriptors of the hid descriptor have an incorrect value, this can also cause out-of-bounds while approaching hdesc->desc[n]. So check the size of hid descriptor and bNumDescriptors. BUG: KASAN: slab-out-of-bounds in usbhid_parse+0x9b1/0xa20 Read of size 1 at addr ffff88006c5f8edf by task kworker/1:2/1261 CPU: 1 PID: 1261 Comm: kworker/1:2 Not tainted 4.14.0-rc1-42251-gebb2c2437d80 #169 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Bochs 01/01/2011 Workqueue: usb_hub_wq hub_event Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x22f/0x340 mm/kasan/report.c:409 __asan_report_load1_noabort+0x19/0x20 mm/kasan/report.c:427 usbhid_parse+0x9b1/0xa20 drivers/hid/usbhid/hid-core.c:1004 hid_add_device+0x16b/0xb30 drivers/hid/hid-core.c:2944 usbhid_probe+0xc28/0x1100 drivers/hid/usbhid/hid-core.c:1369 usb_probe_interface+0x35d/0x8e0 drivers/usb/core/driver.c:361 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_set_configuration+0x104e/0x1870 drivers/usb/core/message.c:1932 generic_probe+0x73/0xe0 drivers/usb/core/generic.c:174 usb_probe_device+0xaf/0xe0 drivers/usb/core/driver.c:266 really_probe drivers/base/dd.c:413 driver_probe_device+0x610/0xa00 drivers/base/dd.c:557 __device_attach_driver+0x230/0x290 drivers/base/dd.c:653 bus_for_each_drv+0x161/0x210 drivers/base/bus.c:463 __device_attach+0x26e/0x3d0 drivers/base/dd.c:710 device_initial_probe+0x1f/0x30 drivers/base/dd.c:757 bus_probe_device+0x1eb/0x290 drivers/base/bus.c:523 device_add+0xd0b/0x1660 drivers/base/core.c:1835 usb_new_device+0x7b8/0x1020 drivers/usb/core/hub.c:2457 hub_port_connect drivers/usb/core/hub.c:4903 hub_port_connect_change drivers/usb/core/hub.c:5009 port_event drivers/usb/core/hub.c:5115 hub_event+0x194d/0x3740 drivers/usb/core/hub.c:5195 process_one_work+0xc7f/0x1db0 kernel/workqueue.c:2119 worker_thread+0x221/0x1850 kernel/workqueue.c:2253 kthread+0x3a1/0x470 kernel/kthread.c:231 ret_from_fork+0x2a/0x40 arch/x86/entry/entry_64.S:431 Reported-by: Andrey Konovalov Signed-off-by: Jaejoong Kim Tested-by: Andrey Konovalov Acked-by: Alan Stern Signed-off-by: Jiri Kosina Signed-off-by: Greg Kroah-Hartman --- drivers/hid/usbhid/hid-core.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/hid/usbhid/hid-core.c b/drivers/hid/usbhid/hid-core.c index 0df32fe0e345..b0eeb5090c91 100644 --- a/drivers/hid/usbhid/hid-core.c +++ b/drivers/hid/usbhid/hid-core.c @@ -971,6 +971,8 @@ static int usbhid_parse(struct hid_device *hid) unsigned int rsize = 0; char *rdesc; int ret, n; + int num_descriptors; + size_t offset = offsetof(struct hid_descriptor, desc); quirks = usbhid_lookup_quirk(le16_to_cpu(dev->descriptor.idVendor), le16_to_cpu(dev->descriptor.idProduct)); @@ -993,10 +995,18 @@ static int usbhid_parse(struct hid_device *hid) return -ENODEV; } + if (hdesc->bLength < sizeof(struct hid_descriptor)) { + dbg_hid("hid descriptor is too short\n"); + return -EINVAL; + } + hid->version = le16_to_cpu(hdesc->bcdHID); hid->country = hdesc->bCountryCode; - for (n = 0; n < hdesc->bNumDescriptors; n++) + num_descriptors = min_t(int, hdesc->bNumDescriptors, + (hdesc->bLength - offset) / sizeof(struct hid_class_descriptor)); + + for (n = 0; n < num_descriptors; n++) if (hdesc->desc[n].bDescriptorType == HID_DT_REPORT) rsize = le16_to_cpu(hdesc->desc[n].wDescriptorLength); From 03bd90fc82e4a0b39455de5a9a7a75a7a9a7e662 Mon Sep 17 00:00:00 2001 From: Herbert Xu Date: Mon, 9 Oct 2017 23:30:02 +0800 Subject: [PATCH 528/546] crypto: shash - Fix zero-length shash ahash digest crash MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit commit b61907bb42409adf9b3120f741af7c57dd7e3db2 upstream. The shash ahash digest adaptor function may crash if given a zero-length input together with a null SG list. This is because it tries to read the SG list before looking at the length. This patch fixes it by checking the length first. Reported-by: Stephan Müller Signed-off-by: Herbert Xu Tested-by: Stephan Müller Signed-off-by: Greg Kroah-Hartman --- crypto/shash.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/crypto/shash.c b/crypto/shash.c index 359754591653..b2cd109d9171 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -274,12 +274,14 @@ static int shash_async_finup(struct ahash_request *req) int shash_ahash_digest(struct ahash_request *req, struct shash_desc *desc) { - struct scatterlist *sg = req->src; - unsigned int offset = sg->offset; unsigned int nbytes = req->nbytes; + struct scatterlist *sg; + unsigned int offset; int err; - if (nbytes < min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset)) { + if (nbytes && + (sg = req->src, offset = sg->offset, + nbytes < min(sg->length, ((unsigned int)(PAGE_SIZE)) - offset))) { void *data; data = kmap_atomic(sg_page(sg)); From 6a92b9997028a1b223da940c2527fd8edccfceb5 Mon Sep 17 00:00:00 2001 From: Haozhong Zhang Date: Tue, 10 Oct 2017 15:01:22 +0800 Subject: [PATCH 529/546] KVM: nVMX: fix guest CR4 loading when emulating L2 to L1 exit commit 8eb3f87d903168bdbd1222776a6b1e281f50513e upstream. When KVM emulates an exit from L2 to L1, it loads L1 CR4 into the guest CR4. Before this CR4 loading, the guest CR4 refers to L2 CR4. Because these two CR4's are in different levels of guest, we should vmx_set_cr4() rather than kvm_set_cr4() here. The latter, which is used to handle guest writes to its CR4, checks the guest change to CR4 and may fail if the change is invalid. The failure may cause trouble. Consider we start a L1 guest with non-zero L1 PCID in use, (i.e. L1 CR4.PCIDE == 1 && L1 CR3.PCID != 0) and a L2 guest with L2 PCID disabled, (i.e. L2 CR4.PCIDE == 0) and following events may happen: 1. If kvm_set_cr4() is used in load_vmcs12_host_state() to load L1 CR4 into guest CR4 (in VMCS01) for L2 to L1 exit, it will fail because of PCID check. As a result, the guest CR4 recorded in L0 KVM (i.e. vcpu->arch.cr4) is left to the value of L2 CR4. 2. Later, if L1 attempts to change its CR4, e.g., clearing VMXE bit, kvm_set_cr4() in L0 KVM will think L1 also wants to enable PCID, because the wrong L2 CR4 is used by L0 KVM as L1 CR4. As L1 CR3.PCID != 0, L0 KVM will inject GP to L1 guest. Fixes: 4704d0befb072 ("KVM: nVMX: Exiting from L2 to L1") Cc: qemu-stable@nongnu.org Signed-off-by: Haozhong Zhang Signed-off-by: Paolo Bonzini Signed-off-by: Greg Kroah-Hartman --- arch/x86/kvm/vmx.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index a018dff00808..9114588e3e61 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -10369,7 +10369,7 @@ static void load_vmcs12_host_state(struct kvm_vcpu *vcpu, * (KVM doesn't change it)- no reason to call set_cr4_guest_host_mask(); */ vcpu->arch.cr4_guest_owned_bits = ~vmcs_readl(CR4_GUEST_HOST_MASK); - kvm_set_cr4(vcpu, vmcs12->host_cr4); + vmx_set_cr4(vcpu, vmcs12->host_cr4); nested_ept_uninit_mmu_context(vcpu); From eb7f31673bbc6a87670139a856d8b950be0820c6 Mon Sep 17 00:00:00 2001 From: Kazuya Mizuguchi Date: Mon, 2 Oct 2017 14:01:41 +0900 Subject: [PATCH 530/546] usb: renesas_usbhs: Fix DMAC sequence for receiving zero-length packet commit 29c7f3e68eec4ae94d85ad7b5dfdafdb8089f513 upstream. The DREQE bit of the DnFIFOSEL should be set to 1 after the DE bit of USB-DMAC on R-Car SoCs is set to 1 after the USB-DMAC received a zero-length packet. Otherwise, a transfer completion interruption of USB-DMAC doesn't happen. Even if the driver changes the sequence, normal operations (transmit/receive without zero-length packet) will not cause any side-effects. So, this patch fixes the sequence anyway. Signed-off-by: Kazuya Mizuguchi [shimoda: revise the commit log] Fixes: e73a9891b3a1 ("usb: renesas_usbhs: add DMAEngine support") Signed-off-by: Yoshihiro Shimoda Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/renesas_usbhs/fifo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/usb/renesas_usbhs/fifo.c b/drivers/usb/renesas_usbhs/fifo.c index d95cd1a72b66..8bb9367ada45 100644 --- a/drivers/usb/renesas_usbhs/fifo.c +++ b/drivers/usb/renesas_usbhs/fifo.c @@ -858,9 +858,9 @@ static void xfer_work(struct work_struct *work) fifo->name, usbhs_pipe_number(pipe), pkt->length, pkt->zero); usbhs_pipe_running(pipe, 1); - usbhsf_dma_start(pipe, fifo); usbhs_pipe_set_trans_count_if_bulk(pipe, pkt->trans); dma_async_issue_pending(chan); + usbhsf_dma_start(pipe, fifo); usbhs_pipe_enable(pipe); xfer_work_end: From 050c4bbc09f17cc60363dabdda1c31aa36a66f1d Mon Sep 17 00:00:00 2001 From: Joerg Roedel Date: Fri, 13 Oct 2017 14:32:37 +0200 Subject: [PATCH 531/546] iommu/amd: Finish TLB flush in amd_iommu_unmap() commit ce76353f169a6471542d999baf3d29b121dce9c0 upstream. The function only sends the flush command to the IOMMU(s), but does not wait for its completion when it returns. Fix that. Fixes: 601367d76bd1 ('x86/amd-iommu: Remove iommu_flush_domain function') Signed-off-by: Joerg Roedel Signed-off-by: Greg Kroah-Hartman --- drivers/iommu/amd_iommu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/iommu/amd_iommu.c b/drivers/iommu/amd_iommu.c index a0ef57483ebb..52c36394dba5 100644 --- a/drivers/iommu/amd_iommu.c +++ b/drivers/iommu/amd_iommu.c @@ -3096,6 +3096,7 @@ static size_t amd_iommu_unmap(struct iommu_domain *dom, unsigned long iova, mutex_unlock(&domain->api_lock); domain_flush_tlb_pde(domain); + domain_flush_complete(domain); return unmap_size; } From dc7c3bd09c7d2063c4d1be23d72ee85f1b3bb947 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Tue, 10 Oct 2017 14:10:32 +0200 Subject: [PATCH 532/546] ALSA: usb-audio: Kill stray URB at exiting commit 124751d5e63c823092060074bd0abaae61aaa9c4 upstream. USB-audio driver may leave a stray URB for the mixer interrupt when it exits by some error during probe. This leads to a use-after-free error as spotted by syzkaller like: ================================================================== BUG: KASAN: use-after-free in snd_usb_mixer_interrupt+0x604/0x6f0 Call Trace: __dump_stack lib/dump_stack.c:16 dump_stack+0x292/0x395 lib/dump_stack.c:52 print_address_description+0x78/0x280 mm/kasan/report.c:252 kasan_report_error mm/kasan/report.c:351 kasan_report+0x23d/0x350 mm/kasan/report.c:409 __asan_report_load8_noabort+0x19/0x20 mm/kasan/report.c:430 snd_usb_mixer_interrupt+0x604/0x6f0 sound/usb/mixer.c:2490 __usb_hcd_giveback_urb+0x2e0/0x650 drivers/usb/core/hcd.c:1779 .... Allocated by task 1484: save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 kasan_kmalloc+0xad/0xe0 mm/kasan/kasan.c:551 kmem_cache_alloc_trace+0x11e/0x2d0 mm/slub.c:2772 kmalloc ./include/linux/slab.h:493 kzalloc ./include/linux/slab.h:666 snd_usb_create_mixer+0x145/0x1010 sound/usb/mixer.c:2540 create_standard_mixer_quirk+0x58/0x80 sound/usb/quirks.c:516 snd_usb_create_quirk+0x92/0x100 sound/usb/quirks.c:560 create_composite_quirk+0x1c4/0x3e0 sound/usb/quirks.c:59 snd_usb_create_quirk+0x92/0x100 sound/usb/quirks.c:560 usb_audio_probe+0x1040/0x2c10 sound/usb/card.c:618 .... Freed by task 1484: save_stack_trace+0x1b/0x20 arch/x86/kernel/stacktrace.c:59 save_stack+0x43/0xd0 mm/kasan/kasan.c:447 set_track mm/kasan/kasan.c:459 kasan_slab_free+0x72/0xc0 mm/kasan/kasan.c:524 slab_free_hook mm/slub.c:1390 slab_free_freelist_hook mm/slub.c:1412 slab_free mm/slub.c:2988 kfree+0xf6/0x2f0 mm/slub.c:3919 snd_usb_mixer_free+0x11a/0x160 sound/usb/mixer.c:2244 snd_usb_mixer_dev_free+0x36/0x50 sound/usb/mixer.c:2250 __snd_device_free+0x1ff/0x380 sound/core/device.c:91 snd_device_free_all+0x8f/0xe0 sound/core/device.c:244 snd_card_do_free sound/core/init.c:461 release_card_device+0x47/0x170 sound/core/init.c:181 device_release+0x13f/0x210 drivers/base/core.c:814 .... Actually such a URB is killed properly at disconnection when the device gets probed successfully, and what we need is to apply it for the error-path, too. In this patch, we apply snd_usb_mixer_disconnect() at releasing. Also introduce a new flag, disconnected, to struct usb_mixer_interface for not performing the disconnection procedure twice. Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/mixer.c | 12 ++++++++++-- sound/usb/mixer.h | 2 ++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/sound/usb/mixer.c b/sound/usb/mixer.c index 696de5ac69be..a23efc8671d6 100644 --- a/sound/usb/mixer.c +++ b/sound/usb/mixer.c @@ -2161,6 +2161,9 @@ static int parse_audio_unit(struct mixer_build *state, int unitid) static void snd_usb_mixer_free(struct usb_mixer_interface *mixer) { + /* kill pending URBs */ + snd_usb_mixer_disconnect(mixer); + kfree(mixer->id_elems); if (mixer->urb) { kfree(mixer->urb->transfer_buffer); @@ -2504,8 +2507,13 @@ int snd_usb_create_mixer(struct snd_usb_audio *chip, int ctrlif, void snd_usb_mixer_disconnect(struct usb_mixer_interface *mixer) { - usb_kill_urb(mixer->urb); - usb_kill_urb(mixer->rc_urb); + if (mixer->disconnected) + return; + if (mixer->urb) + usb_kill_urb(mixer->urb); + if (mixer->rc_urb) + usb_kill_urb(mixer->rc_urb); + mixer->disconnected = true; } #ifdef CONFIG_PM diff --git a/sound/usb/mixer.h b/sound/usb/mixer.h index 2b4b067646ab..545d99b09706 100644 --- a/sound/usb/mixer.h +++ b/sound/usb/mixer.h @@ -22,6 +22,8 @@ struct usb_mixer_interface { struct urb *rc_urb; struct usb_ctrlrequest *rc_setup_packet; u8 rc_buffer[6]; + + bool disconnected; }; #define MAX_CHANNELS 16 /* max logical channels */ From 23709ae9b61429502fcd4686e7a97333f3b3544a Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 11:09:20 +0200 Subject: [PATCH 533/546] ALSA: seq: Fix use-after-free at creating a port commit 71105998845fb012937332fe2e806d443c09e026 upstream. There is a potential race window opened at creating and deleting a port via ioctl, as spotted by fuzzing. snd_seq_create_port() creates a port object and returns its pointer, but it doesn't take the refcount, thus it can be deleted immediately by another thread. Meanwhile, snd_seq_ioctl_create_port() still calls the function snd_seq_system_client_ev_port_start() with the created port object that is being deleted, and this triggers use-after-free like: BUG: KASAN: use-after-free in snd_seq_ioctl_create_port+0x504/0x630 [snd_seq] at addr ffff8801f2241cb1 ============================================================================= BUG kmalloc-512 (Tainted: G B ): kasan: bad access detected ----------------------------------------------------------------------------- INFO: Allocated in snd_seq_create_port+0x94/0x9b0 [snd_seq] age=1 cpu=3 pid=4511 ___slab_alloc+0x425/0x460 __slab_alloc+0x20/0x40 kmem_cache_alloc_trace+0x150/0x190 snd_seq_create_port+0x94/0x9b0 [snd_seq] snd_seq_ioctl_create_port+0xd1/0x630 [snd_seq] snd_seq_do_ioctl+0x11c/0x190 [snd_seq] snd_seq_ioctl+0x40/0x80 [snd_seq] do_vfs_ioctl+0x54b/0xda0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x16/0x75 INFO: Freed in port_delete+0x136/0x1a0 [snd_seq] age=1 cpu=2 pid=4717 __slab_free+0x204/0x310 kfree+0x15f/0x180 port_delete+0x136/0x1a0 [snd_seq] snd_seq_delete_port+0x235/0x350 [snd_seq] snd_seq_ioctl_delete_port+0xc8/0x180 [snd_seq] snd_seq_do_ioctl+0x11c/0x190 [snd_seq] snd_seq_ioctl+0x40/0x80 [snd_seq] do_vfs_ioctl+0x54b/0xda0 SyS_ioctl+0x79/0x90 entry_SYSCALL_64_fastpath+0x16/0x75 Call Trace: [] dump_stack+0x63/0x82 [] print_trailer+0xfb/0x160 [] object_err+0x34/0x40 [] kasan_report.part.2+0x223/0x520 [] ? snd_seq_ioctl_create_port+0x504/0x630 [snd_seq] [] __asan_report_load1_noabort+0x2e/0x30 [] snd_seq_ioctl_create_port+0x504/0x630 [snd_seq] [] ? snd_seq_ioctl_delete_port+0x180/0x180 [snd_seq] [] ? taskstats_exit+0xbc0/0xbc0 [] snd_seq_do_ioctl+0x11c/0x190 [snd_seq] [] snd_seq_ioctl+0x40/0x80 [snd_seq] [] ? acct_account_cputime+0x63/0x80 [] do_vfs_ioctl+0x54b/0xda0 ..... We may fix this in a few different ways, and in this patch, it's fixed simply by taking the refcount properly at snd_seq_create_port() and letting the caller unref the object after use. Also, there is another potential use-after-free by sprintf() call in snd_seq_create_port(), and this is moved inside the lock. This fix covers CVE-2017-15265. Reported-and-tested-by: Michael23 Yu Suggested-by: Linus Torvalds Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/core/seq/seq_clientmgr.c | 6 +++++- sound/core/seq/seq_ports.c | 7 +++++-- 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/sound/core/seq/seq_clientmgr.c b/sound/core/seq/seq_clientmgr.c index e326c1d80416..e847b9923c19 100644 --- a/sound/core/seq/seq_clientmgr.c +++ b/sound/core/seq/seq_clientmgr.c @@ -1260,6 +1260,7 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, struct snd_seq_client_port *port; struct snd_seq_port_info info; struct snd_seq_port_callback *callback; + int port_idx; if (copy_from_user(&info, arg, sizeof(info))) return -EFAULT; @@ -1273,7 +1274,9 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, return -ENOMEM; if (client->type == USER_CLIENT && info.kernel) { - snd_seq_delete_port(client, port->addr.port); + port_idx = port->addr.port; + snd_seq_port_unlock(port); + snd_seq_delete_port(client, port_idx); return -EINVAL; } if (client->type == KERNEL_CLIENT) { @@ -1294,6 +1297,7 @@ static int snd_seq_ioctl_create_port(struct snd_seq_client *client, snd_seq_set_port_info(port, &info); snd_seq_system_client_ev_port_start(port->addr.client, port->addr.port); + snd_seq_port_unlock(port); if (copy_to_user(arg, &info, sizeof(info))) return -EFAULT; diff --git a/sound/core/seq/seq_ports.c b/sound/core/seq/seq_ports.c index fe686ee41c6d..f04714d70bf7 100644 --- a/sound/core/seq/seq_ports.c +++ b/sound/core/seq/seq_ports.c @@ -122,7 +122,9 @@ static void port_subs_info_init(struct snd_seq_port_subs_info *grp) } -/* create a port, port number is returned (-1 on failure) */ +/* create a port, port number is returned (-1 on failure); + * the caller needs to unref the port via snd_seq_port_unlock() appropriately + */ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client, int port) { @@ -151,6 +153,7 @@ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client, snd_use_lock_init(&new_port->use_lock); port_subs_info_init(&new_port->c_src); port_subs_info_init(&new_port->c_dest); + snd_use_lock_use(&new_port->use_lock); num = port >= 0 ? port : 0; mutex_lock(&client->ports_mutex); @@ -165,9 +168,9 @@ struct snd_seq_client_port *snd_seq_create_port(struct snd_seq_client *client, list_add_tail(&new_port->list, &p->list); client->num_ports++; new_port->addr.port = num; /* store the port number in the port */ + sprintf(new_port->name, "port-%d", num); write_unlock_irqrestore(&client->ports_lock, flags); mutex_unlock(&client->ports_mutex); - sprintf(new_port->name, "port-%d", num); return new_port; } From ca2523c9c569186e1e39f5f9db6b593d3f4ccf87 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 10:02:56 +0200 Subject: [PATCH 534/546] ALSA: seq: Fix copy_from_user() call inside lock commit 5803b023881857db32ffefa0d269c90280a67ee0 upstream. The event handler in the virmidi sequencer code takes a read-lock for the linked list traverse, while it's calling snd_seq_dump_var_event() in the loop. The latter function may expand the user-space data depending on the event type. It eventually invokes copy_from_user(), which might be a potential dead-lock. The sequencer core guarantees that the user-space data is passed only with atomic=0 argument, but snd_virmidi_dev_receive_event() ignores it and always takes read-lock(). For avoiding the problem above, this patch introduces rwsem for non-atomic case, while keeping rwlock for atomic case. Also while we're at it: the superfluous irq flags is dropped in snd_virmidi_input_open(). Reported-by: Jia-Ju Bai Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- include/sound/seq_virmidi.h | 1 + sound/core/seq/seq_virmidi.c | 27 +++++++++++++++++++-------- 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/include/sound/seq_virmidi.h b/include/sound/seq_virmidi.h index a03acd0d398a..695257ae64ac 100644 --- a/include/sound/seq_virmidi.h +++ b/include/sound/seq_virmidi.h @@ -60,6 +60,7 @@ struct snd_virmidi_dev { int port; /* created/attached port */ unsigned int flags; /* SNDRV_VIRMIDI_* */ rwlock_t filelist_lock; + struct rw_semaphore filelist_sem; struct list_head filelist; }; diff --git a/sound/core/seq/seq_virmidi.c b/sound/core/seq/seq_virmidi.c index 81134e067184..3b126af4a026 100644 --- a/sound/core/seq/seq_virmidi.c +++ b/sound/core/seq/seq_virmidi.c @@ -77,13 +77,17 @@ static void snd_virmidi_init_event(struct snd_virmidi *vmidi, * decode input event and put to read buffer of each opened file */ static int snd_virmidi_dev_receive_event(struct snd_virmidi_dev *rdev, - struct snd_seq_event *ev) + struct snd_seq_event *ev, + bool atomic) { struct snd_virmidi *vmidi; unsigned char msg[4]; int len; - read_lock(&rdev->filelist_lock); + if (atomic) + read_lock(&rdev->filelist_lock); + else + down_read(&rdev->filelist_sem); list_for_each_entry(vmidi, &rdev->filelist, list) { if (!vmidi->trigger) continue; @@ -97,7 +101,10 @@ static int snd_virmidi_dev_receive_event(struct snd_virmidi_dev *rdev, snd_rawmidi_receive(vmidi->substream, msg, len); } } - read_unlock(&rdev->filelist_lock); + if (atomic) + read_unlock(&rdev->filelist_lock); + else + up_read(&rdev->filelist_sem); return 0; } @@ -115,7 +122,7 @@ int snd_virmidi_receive(struct snd_rawmidi *rmidi, struct snd_seq_event *ev) struct snd_virmidi_dev *rdev; rdev = rmidi->private_data; - return snd_virmidi_dev_receive_event(rdev, ev); + return snd_virmidi_dev_receive_event(rdev, ev, true); } #endif /* 0 */ @@ -130,7 +137,7 @@ static int snd_virmidi_event_input(struct snd_seq_event *ev, int direct, rdev = private_data; if (!(rdev->flags & SNDRV_VIRMIDI_USE)) return 0; /* ignored */ - return snd_virmidi_dev_receive_event(rdev, ev); + return snd_virmidi_dev_receive_event(rdev, ev, atomic); } /* @@ -209,7 +216,6 @@ static int snd_virmidi_input_open(struct snd_rawmidi_substream *substream) struct snd_virmidi_dev *rdev = substream->rmidi->private_data; struct snd_rawmidi_runtime *runtime = substream->runtime; struct snd_virmidi *vmidi; - unsigned long flags; vmidi = kzalloc(sizeof(*vmidi), GFP_KERNEL); if (vmidi == NULL) @@ -223,9 +229,11 @@ static int snd_virmidi_input_open(struct snd_rawmidi_substream *substream) vmidi->client = rdev->client; vmidi->port = rdev->port; runtime->private_data = vmidi; - write_lock_irqsave(&rdev->filelist_lock, flags); + down_write(&rdev->filelist_sem); + write_lock_irq(&rdev->filelist_lock); list_add_tail(&vmidi->list, &rdev->filelist); - write_unlock_irqrestore(&rdev->filelist_lock, flags); + write_unlock_irq(&rdev->filelist_lock); + up_write(&rdev->filelist_sem); vmidi->rdev = rdev; return 0; } @@ -264,9 +272,11 @@ static int snd_virmidi_input_close(struct snd_rawmidi_substream *substream) struct snd_virmidi_dev *rdev = substream->rmidi->private_data; struct snd_virmidi *vmidi = substream->runtime->private_data; + down_write(&rdev->filelist_sem); write_lock_irq(&rdev->filelist_lock); list_del(&vmidi->list); write_unlock_irq(&rdev->filelist_lock); + up_write(&rdev->filelist_sem); snd_midi_event_free(vmidi->parser); substream->runtime->private_data = NULL; kfree(vmidi); @@ -520,6 +530,7 @@ int snd_virmidi_new(struct snd_card *card, int device, struct snd_rawmidi **rrmi rdev->rmidi = rmidi; rdev->device = device; rdev->client = -1; + init_rwsem(&rdev->filelist_sem); rwlock_init(&rdev->filelist_lock); INIT_LIST_HEAD(&rdev->filelist); rdev->seq_mode = SNDRV_VIRMIDI_SEQ_DISPATCH; From 5b01343ad1bdc0d0d0578571395f63072d5d3072 Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Wed, 11 Oct 2017 16:39:02 +0200 Subject: [PATCH 535/546] ALSA: caiaq: Fix stray URB at probe error path commit 99fee508245825765ff60155fed43f970ff83a8f upstream. caiaq driver doesn't kill the URB properly at its error path during the probe, which may lead to a use-after-free error later. This patch addresses it. Reported-by: Johan Hovold Reviewed-by: Johan Hovold Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/caiaq/device.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/sound/usb/caiaq/device.c b/sound/usb/caiaq/device.c index b871ba407e4e..4458190149d1 100644 --- a/sound/usb/caiaq/device.c +++ b/sound/usb/caiaq/device.c @@ -469,10 +469,12 @@ static int init_card(struct snd_usb_caiaqdev *cdev) err = snd_usb_caiaq_send_command(cdev, EP1_CMD_GET_DEVICE_INFO, NULL, 0); if (err) - return err; + goto err_kill_urb; - if (!wait_event_timeout(cdev->ep1_wait_queue, cdev->spec_received, HZ)) - return -ENODEV; + if (!wait_event_timeout(cdev->ep1_wait_queue, cdev->spec_received, HZ)) { + err = -ENODEV; + goto err_kill_urb; + } usb_string(usb_dev, usb_dev->descriptor.iManufacturer, cdev->vendor_name, CAIAQ_USB_STR_LEN); @@ -507,6 +509,10 @@ static int init_card(struct snd_usb_caiaqdev *cdev) setup_card(cdev); return 0; + + err_kill_urb: + usb_kill_urb(&cdev->ep1_in_urb); + return err; } static int snd_probe(struct usb_interface *intf, From 16c1ef65f4db6a3621d653378d27f611c9840bee Mon Sep 17 00:00:00 2001 From: Takashi Iwai Date: Mon, 9 Oct 2017 14:51:23 +0200 Subject: [PATCH 536/546] ALSA: line6: Fix leftover URB at error-path during probe commit c95072b3d88fac4be295815f2b67df366c0c297f upstream. While line6_probe() may kick off URB for a control MIDI endpoint, the function doesn't clean up it properly at its error path. This results in a leftover URB action that is eventually triggered later and causes an Oops like: general protection fault: 0000 [#1] PREEMPT SMP KASAN CPU: 1 PID: 0 Comm: swapper/1 Not tainted RIP: 0010:usb_fill_bulk_urb ./include/linux/usb.h:1619 RIP: 0010:line6_start_listen+0x3fe/0x9e0 sound/usb/line6/driver.c:76 Call Trace: line6_data_received+0x1f7/0x470 sound/usb/line6/driver.c:326 __usb_hcd_giveback_urb+0x2e0/0x650 drivers/usb/core/hcd.c:1779 usb_hcd_giveback_urb+0x337/0x420 drivers/usb/core/hcd.c:1845 dummy_timer+0xba9/0x39f0 drivers/usb/gadget/udc/dummy_hcd.c:1965 call_timer_fn+0x2a2/0x940 kernel/time/timer.c:1281 .... Since the whole clean-up procedure is done in line6_disconnect() callback, we can simply call it in the error path instead of open-coding the whole again. It'll fix such an issue automagically. The bug was spotted by syzkaller. Fixes: eedd0e95d355 ("ALSA: line6: Don't forget to call driver's destructor at error path") Reported-by: Andrey Konovalov Tested-by: Andrey Konovalov Signed-off-by: Takashi Iwai Signed-off-by: Greg Kroah-Hartman --- sound/usb/line6/driver.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/sound/usb/line6/driver.c b/sound/usb/line6/driver.c index 183311cb849e..be78078a10ba 100644 --- a/sound/usb/line6/driver.c +++ b/sound/usb/line6/driver.c @@ -586,9 +586,10 @@ int line6_probe(struct usb_interface *interface, return 0; error: - if (line6->disconnect) - line6->disconnect(line6); - snd_card_free(card); + /* we can call disconnect callback here because no close-sync is + * needed yet at this point + */ + line6_disconnect(interface); return ret; } EXPORT_SYMBOL_GPL(line6_probe); From ac94abbb7941933946bbb7ce9aca7365fa7d5e74 Mon Sep 17 00:00:00 2001 From: Andrew Gabbasov Date: Sat, 30 Sep 2017 08:55:55 -0700 Subject: [PATCH 537/546] usb: gadget: composite: Fix use-after-free in usb_composite_overwrite_options commit aec17e1e249567e82b26dafbb86de7d07fde8729 upstream. KASAN enabled configuration reports an error BUG: KASAN: use-after-free in usb_composite_overwrite_options+... [libcomposite] at addr ... Read of size 1 by task ... when some driver is un-bound and then bound again. For example, this happens with FunctionFS driver when "ffs-test" test application is run several times in a row. If the driver has empty manufacturer ID string in initial static data, it is then replaced with generated string. After driver unbinding the generated string is freed, but the driver data still keep that pointer. And if the driver is then bound again, that pointer is re-used for string emptiness check. The fix is to clean up the driver string data upon its unbinding to drop the pointer to freed memory. Fixes: cc2683c318a5 ("usb: gadget: Provide a default implementation of default manufacturer string") Signed-off-by: Andrew Gabbasov Signed-off-by: Felipe Balbi Signed-off-by: Greg Kroah-Hartman --- drivers/usb/gadget/composite.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/usb/gadget/composite.c b/drivers/usb/gadget/composite.c index e2641d4dfdd6..d186d0282a38 100644 --- a/drivers/usb/gadget/composite.c +++ b/drivers/usb/gadget/composite.c @@ -1905,6 +1905,8 @@ static DEVICE_ATTR_RO(suspended); static void __composite_unbind(struct usb_gadget *gadget, bool unbind_driver) { struct usb_composite_dev *cdev = get_gadget_data(gadget); + struct usb_gadget_strings *gstr = cdev->driver->strings[0]; + struct usb_string *dev_str = gstr->strings; /* composite_disconnect() must already have been called * by the underlying peripheral controller driver! @@ -1924,6 +1926,9 @@ static void __composite_unbind(struct usb_gadget *gadget, bool unbind_driver) composite_dev_cleanup(cdev); + if (dev_str[USB_GADGET_MANUFACTURER_IDX].s == cdev->def_manufacturer) + dev_str[USB_GADGET_MANUFACTURER_IDX].s = ""; + kfree(cdev->def_manufacturer); kfree(cdev); set_gadget_data(gadget, NULL); From f3b538493e665aa083388818b975ccf27979dec6 Mon Sep 17 00:00:00 2001 From: Andreas Gruenbacher Date: Mon, 9 Oct 2017 11:13:18 +0200 Subject: [PATCH 538/546] direct-io: Prevent NULL pointer access in submit_page_section commit 899f0429c7d3eed886406cd72182bee3b96aa1f9 upstream. In the code added to function submit_page_section by commit b1058b981, sdio->bio can currently be NULL when calling dio_bio_submit. This then leads to a NULL pointer access in dio_bio_submit, so check for a NULL bio in submit_page_section before trying to submit it instead. Fixes xfstest generic/250 on gfs2. Signed-off-by: Andreas Gruenbacher Reviewed-by: Jan Kara Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- fs/direct-io.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index c772fdf36cd9..44f49d86d714 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -823,7 +823,8 @@ submit_page_section(struct dio *dio, struct dio_submit *sdio, struct page *page, */ if (sdio->boundary) { ret = dio_send_cur_page(dio, sdio, map_bh); - dio_bio_submit(dio, sdio); + if (sdio->bio) + dio_bio_submit(dio, sdio); page_cache_release(sdio->cur_page); sdio->cur_page = NULL; } From 399c46095eb5d41934c8a99732028a9f1cf0ac50 Mon Sep 17 00:00:00 2001 From: Vitaly Mayatskikh Date: Fri, 22 Sep 2017 01:18:39 -0400 Subject: [PATCH 539/546] fix unbalanced page refcounting in bio_map_user_iov commit 95d78c28b5a85bacbc29b8dba7c04babb9b0d467 upstream. bio_map_user_iov and bio_unmap_user do unbalanced pages refcounting if IO vector has small consecutive buffers belonging to the same page. bio_add_pc_page merges them into one, but the page reference is never dropped. Signed-off-by: Vitaly Mayatskikh Signed-off-by: Al Viro Signed-off-by: Greg Kroah-Hartman --- block/bio.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/block/bio.c b/block/bio.c index 14263fab94d3..68bbc835bacc 100644 --- a/block/bio.c +++ b/block/bio.c @@ -1320,6 +1320,7 @@ struct bio *bio_map_user_iov(struct request_queue *q, offset = uaddr & ~PAGE_MASK; for (j = cur_page; j < page_limit; j++) { unsigned int bytes = PAGE_SIZE - offset; + unsigned short prev_bi_vcnt = bio->bi_vcnt; if (len <= 0) break; @@ -1334,6 +1335,13 @@ struct bio *bio_map_user_iov(struct request_queue *q, bytes) break; + /* + * check if vector was merged with previous + * drop page reference if needed + */ + if (bio->bi_vcnt == prev_bi_vcnt) + put_page(pages[j]); + len -= bytes; offset = 0; } From b1f5a26964bf28e1c75c2af61502c27fc6fd03a0 Mon Sep 17 00:00:00 2001 From: Jeffrey Chu Date: Fri, 8 Sep 2017 21:08:58 +0000 Subject: [PATCH 540/546] USB: serial: ftdi_sio: add id for Cypress WICED dev board commit a6c215e21b0dc5fe9416dce90f9acc2ea53c4502 upstream. Add CYPRESS_VID vid and CYPRESS_WICED_BT_USB and CYPRESS_WICED_WL_USB device IDs to ftdi_sio driver. Signed-off-by: Jeffrey Chu Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/ftdi_sio.c | 2 ++ drivers/usb/serial/ftdi_sio_ids.h | 7 +++++++ 2 files changed, 9 insertions(+) diff --git a/drivers/usb/serial/ftdi_sio.c b/drivers/usb/serial/ftdi_sio.c index e0385d6c0abb..30344efc123f 100644 --- a/drivers/usb/serial/ftdi_sio.c +++ b/drivers/usb/serial/ftdi_sio.c @@ -1015,6 +1015,8 @@ static const struct usb_device_id id_table_combined[] = { { USB_DEVICE(WICED_VID, WICED_USB20706V2_PID) }, { USB_DEVICE(TI_VID, TI_CC3200_LAUNCHPAD_PID), .driver_info = (kernel_ulong_t)&ftdi_jtag_quirk }, + { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_BT_USB_PID) }, + { USB_DEVICE(CYPRESS_VID, CYPRESS_WICED_WL_USB_PID) }, { } /* Terminating entry */ }; diff --git a/drivers/usb/serial/ftdi_sio_ids.h b/drivers/usb/serial/ftdi_sio_ids.h index 4fcf1cecb6d7..f9d15bd62785 100644 --- a/drivers/usb/serial/ftdi_sio_ids.h +++ b/drivers/usb/serial/ftdi_sio_ids.h @@ -609,6 +609,13 @@ #define ADI_GNICE_PID 0xF000 #define ADI_GNICEPLUS_PID 0xF001 +/* + * Cypress WICED USB UART + */ +#define CYPRESS_VID 0x04B4 +#define CYPRESS_WICED_BT_USB_PID 0x009B +#define CYPRESS_WICED_WL_USB_PID 0xF900 + /* * Microchip Technology, Inc. * From ac22f49fb845db0264946efe71a2a810c8639f32 Mon Sep 17 00:00:00 2001 From: Andreas Engel Date: Mon, 18 Sep 2017 21:11:57 +0200 Subject: [PATCH 541/546] USB: serial: cp210x: add support for ELV TFD500 commit c496ad835c31ad639b6865714270b3003df031f6 upstream. Add the USB device id for the ELV TFD500 data logger. Signed-off-by: Andreas Engel Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/cp210x.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/cp210x.c b/drivers/usb/serial/cp210x.c index 41a6513646de..1f5ecf905b7d 100644 --- a/drivers/usb/serial/cp210x.c +++ b/drivers/usb/serial/cp210x.c @@ -170,6 +170,7 @@ static const struct usb_device_id id_table[] = { { USB_DEVICE(0x1843, 0x0200) }, /* Vaisala USB Instrument Cable */ { USB_DEVICE(0x18EF, 0xE00F) }, /* ELV USB-I2C-Interface */ { USB_DEVICE(0x18EF, 0xE025) }, /* ELV Marble Sound Board 1 */ + { USB_DEVICE(0x18EF, 0xE032) }, /* ELV TFD500 Data Logger */ { USB_DEVICE(0x1901, 0x0190) }, /* GE B850 CP2105 Recorder interface */ { USB_DEVICE(0x1901, 0x0193) }, /* GE B650 CP2104 PMC interface */ { USB_DEVICE(0x1901, 0x0194) }, /* GE Healthcare Remote Alarm Box */ From 34592e06c7af1f1bdac639beebfb352b061f8070 Mon Sep 17 00:00:00 2001 From: Henryk Heisig Date: Mon, 11 Sep 2017 17:57:34 +0200 Subject: [PATCH 542/546] USB: serial: option: add support for TP-Link LTE module commit 837ddc4793a69b256ac5e781a5e729b448a8d983 upstream. This commit adds support for TP-Link LTE mPCIe module is used in in TP-Link MR200v1, MR6400v1 and v2 routers. Signed-off-by: Henryk Heisig Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/option.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/usb/serial/option.c b/drivers/usb/serial/option.c index 2a9944326210..db3d34c2c82e 100644 --- a/drivers/usb/serial/option.c +++ b/drivers/usb/serial/option.c @@ -522,6 +522,7 @@ static void option_instat_callback(struct urb *urb); /* TP-LINK Incorporated products */ #define TPLINK_VENDOR_ID 0x2357 +#define TPLINK_PRODUCT_LTE 0x000D #define TPLINK_PRODUCT_MA180 0x0201 /* Changhong products */ @@ -2011,6 +2012,7 @@ static const struct usb_device_id option_ids[] = { { USB_DEVICE(CELLIENT_VENDOR_ID, CELLIENT_PRODUCT_MEN200) }, { USB_DEVICE(PETATEL_VENDOR_ID, PETATEL_PRODUCT_NP10T_600A) }, { USB_DEVICE(PETATEL_VENDOR_ID, PETATEL_PRODUCT_NP10T_600E) }, + { USB_DEVICE_AND_INTERFACE_INFO(TPLINK_VENDOR_ID, TPLINK_PRODUCT_LTE, 0xff, 0x00, 0x00) }, /* TP-Link LTE Module */ { USB_DEVICE(TPLINK_VENDOR_ID, TPLINK_PRODUCT_MA180), .driver_info = (kernel_ulong_t)&net_intf4_blacklist }, { USB_DEVICE(TPLINK_VENDOR_ID, 0x9000), /* TP-Link MA260 */ From 6c14436b5e848fb7b308b525cad546a37172a288 Mon Sep 17 00:00:00 2001 From: Shrirang Bagul Date: Fri, 29 Sep 2017 12:39:51 +0800 Subject: [PATCH 543/546] USB: serial: qcserial: add Dell DW5818, DW5819 commit f5d9644c5fca7d8e8972268598bb516a7eae17f9 upstream. Dell Wireless 5819/5818 devices are re-branded Sierra Wireless MC74 series which will by default boot with vid 0x413c and pid's 0x81cf, 0x81d0, 0x81d1, 0x81d2. Signed-off-by: Shrirang Bagul Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/qcserial.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/usb/serial/qcserial.c b/drivers/usb/serial/qcserial.c index 652b4334b26d..e1c1e329c877 100644 --- a/drivers/usb/serial/qcserial.c +++ b/drivers/usb/serial/qcserial.c @@ -174,6 +174,10 @@ static const struct usb_device_id id_table[] = { {DEVICE_SWI(0x413c, 0x81b3)}, /* Dell Wireless 5809e Gobi(TM) 4G LTE Mobile Broadband Card (rev3) */ {DEVICE_SWI(0x413c, 0x81b5)}, /* Dell Wireless 5811e QDL */ {DEVICE_SWI(0x413c, 0x81b6)}, /* Dell Wireless 5811e QDL */ + {DEVICE_SWI(0x413c, 0x81cf)}, /* Dell Wireless 5819 */ + {DEVICE_SWI(0x413c, 0x81d0)}, /* Dell Wireless 5819 */ + {DEVICE_SWI(0x413c, 0x81d1)}, /* Dell Wireless 5818 */ + {DEVICE_SWI(0x413c, 0x81d2)}, /* Dell Wireless 5818 */ /* Huawei devices */ {DEVICE_HWI(0x03f0, 0x581d)}, /* HP lt4112 LTE/HSPA+ Gobi 4G Modem (Huawei me906e) */ From 208563455aac7540755bb9d8e8edaf7c5ef61d8c Mon Sep 17 00:00:00 2001 From: Johan Hovold Date: Wed, 4 Oct 2017 11:01:13 +0200 Subject: [PATCH 544/546] USB: serial: console: fix use-after-free after failed setup commit 299d7572e46f98534033a9e65973f13ad1ce9047 upstream. Make sure to reset the USB-console port pointer when console setup fails in order to avoid having the struct usb_serial be prematurely freed by the console code when the device is later disconnected. Fixes: 73e487fdb75f ("[PATCH] USB console: fix disconnection issues") Acked-by: Greg Kroah-Hartman Signed-off-by: Johan Hovold Signed-off-by: Greg Kroah-Hartman --- drivers/usb/serial/console.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/usb/serial/console.c b/drivers/usb/serial/console.c index 3806e7014199..2938153fe7b1 100644 --- a/drivers/usb/serial/console.c +++ b/drivers/usb/serial/console.c @@ -189,6 +189,7 @@ static int usb_console_setup(struct console *co, char *options) tty_kref_put(tty); reset_open_count: port->port.count = 0; + info->port = NULL; usb_autopm_put_interface(serial->interface); error_get_interface: usb_serial_put(serial); From ad505a7b4fb0fd8d533f78083df203d88cdf1a27 Mon Sep 17 00:00:00 2001 From: Mathias Krause Date: Thu, 5 Oct 2017 20:30:12 +0200 Subject: [PATCH 545/546] x86/alternatives: Fix alt_max_short macro to really be a max() commit 6b32c126d33d5cb379bca280ab8acedc1ca978ff upstream. The alt_max_short() macro in asm/alternative.h does not work as intended, leading to nasty bugs. E.g. alt_max_short("1", "3") evaluates to 3, but alt_max_short("3", "1") evaluates to 1 -- not exactly the maximum of 1 and 3. In fact, I had to learn it the hard way by crashing my kernel in not so funny ways by attempting to make use of the ALTENATIVE_2 macro with alternatives where the first one was larger than the second one. According to [1] and commit dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") the right handed side should read "-(-(a < b))" not "-(-(a - b))". Fix that, to make the macro work as intended. While at it, fix up the comments regarding the additional "-", too. It's not about gas' usage of s32 but brain dead logic of having a "true" value of -1 for the < operator ... *sigh* Btw., the one in asm/alternative-asm.h is correct. And, apparently, all current users of ALTERNATIVE_2() pass same sized alternatives, avoiding to hit the bug. [1] http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax Reviewed-and-tested-by: Borislav Petkov Fixes: dbe4058a6a44 ("x86/alternatives: Fix ALTERNATIVE_2 padding generation properly") Signed-off-by: Mathias Krause Signed-off-by: Thomas Gleixner Cc: Borislav Petkov Link: https://lkml.kernel.org/r/1507228213-13095-1-git-send-email-minipli@googlemail.com Signed-off-by: Greg Kroah-Hartman --- arch/x86/include/asm/alternative-asm.h | 4 +++- arch/x86/include/asm/alternative.h | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index e7636bac7372..6c98821fef5e 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -62,8 +62,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 7bfc85bbb8ff..09936e9c8154 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -102,12 +102,12 @@ static inline int alternatives_text_reserved(void *start, void *end) alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is From e1fe3813117f465a2db200aebb13969056986c64 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Wed, 18 Oct 2017 09:20:43 +0200 Subject: [PATCH 546/546] Linux 4.4.93 --- Makefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Makefile b/Makefile index fab2d640a27e..77a17fb24b6d 100644 --- a/Makefile +++ b/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 4 -SUBLEVEL = 92 +SUBLEVEL = 93 EXTRAVERSION = NAME = Blurry Fish Butt