From aa7ab1e20882b04fc3e45da77a9dad5cbbefba99 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Aug 2020 15:02:48 -0400 Subject: [PATCH 001/271] fs: dlm: synchronize dlm before shutdown This patch moves the dlm workqueue dlm synchronization before shutdown handling. The patch just flushes all pending work before starting to shutdown the connection. At least for the send_workqeue we should flush the workqueue to make sure there is no new connection handling going on as dlm_allow_conn switch is turned to false before. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 5050fe05769b..ed098870ba0d 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1624,10 +1624,6 @@ static void work_flush(void) struct hlist_node *n; struct connection *con; - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); do { ok = 1; foreach_conn(stop_conn); @@ -1659,6 +1655,12 @@ void dlm_lowcomms_stop(void) mutex_lock(&connections_lock); dlm_allow_conn = 0; mutex_unlock(&connections_lock); + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + foreach_conn(shutdown_conn); work_flush(); clean_writequeues(); From a47666eb763cc1b8b48bd88185ca56676f40ca89 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Aug 2020 15:02:49 -0400 Subject: [PATCH 002/271] fs: dlm: make connection hash lockless There are some problems with the connections_lock. During my experiements I saw sometimes circular dependencies with sock_lock. The reason here might be code parts which runs nodeid2con() before or after sock_lock is acquired. Another issue are missing locks in for_conn() iteration. Maybe this works fine because for_conn() is running in a context where connection_hash cannot be manipulated by others anymore. However this patch changes the connection_hash to be protected by sleepable rcu. The hotpath function __find_con() is implemented lockless as it is only a reader of connection_hash and this hopefully fixes the circular locking dependencies. The iteration for_conn() will still call some sleepable functionality, that's why we use sleepable rcu in this case. This patch removes the kmemcache functionality as I think I need to make some free() functionality via call_rcu(). However allocation time isn't here an issue. The dlm_allow_con will not be protected by a lock anymore as I think it's enough to just set and flush workqueues afterwards. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/Kconfig | 1 + fs/dlm/lowcomms.c | 86 ++++++++++++++++++++--------------------------- 2 files changed, 37 insertions(+), 50 deletions(-) diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..ee92634196a8 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -4,6 +4,7 @@ menuconfig DLM depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) select IP_SCTP + select SRCU help A general purpose distributed lock manager for kernel or userspace applications. diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index ed098870ba0d..9db7126de793 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -126,6 +126,7 @@ struct connection { struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ + struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -167,8 +168,8 @@ static struct workqueue_struct *recv_workqueue; static struct workqueue_struct *send_workqueue; static struct hlist_head connection_hash[CONN_HASH_SIZE]; -static DEFINE_MUTEX(connections_lock); -static struct kmem_cache *con_cache; +static DEFINE_SPINLOCK(connections_lock); +DEFINE_STATIC_SRCU(connections_srcu); static void process_recv_sockets(struct work_struct *work); static void process_send_sockets(struct work_struct *work); @@ -184,15 +185,20 @@ static inline int nodeid_hash(int nodeid) static struct connection *__find_con(int nodeid) { - int r; + int r, idx; struct connection *con; r = nodeid_hash(nodeid); - hlist_for_each_entry(con, &connection_hash[r], list) { - if (con->nodeid == nodeid) + idx = srcu_read_lock(&connections_srcu); + hlist_for_each_entry_rcu(con, &connection_hash[r], list) { + if (con->nodeid == nodeid) { + srcu_read_unlock(&connections_srcu, idx); return con; + } } + srcu_read_unlock(&connections_srcu, idx); + return NULL; } @@ -200,7 +206,7 @@ static struct connection *__find_con(int nodeid) * If 'allocation' is zero then we don't attempt to create a new * connection structure for this node. */ -static struct connection *__nodeid2con(int nodeid, gfp_t alloc) +static struct connection *nodeid2con(int nodeid, gfp_t alloc) { struct connection *con = NULL; int r; @@ -209,13 +215,10 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) if (con || !alloc) return con; - con = kmem_cache_zalloc(con_cache, alloc); + con = kzalloc(sizeof(*con), alloc); if (!con) return NULL; - r = nodeid_hash(nodeid); - hlist_add_head(&con->list, &connection_hash[r]); - con->nodeid = nodeid; mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); @@ -233,31 +236,27 @@ static struct connection *__nodeid2con(int nodeid, gfp_t alloc) con->rx_action = zerocon->rx_action; } + r = nodeid_hash(nodeid); + + spin_lock(&connections_lock); + hlist_add_head_rcu(&con->list, &connection_hash[r]); + spin_unlock(&connections_lock); + return con; } /* Loop round all connections */ static void foreach_conn(void (*conn_func)(struct connection *c)) { - int i; - struct hlist_node *n; + int i, idx; struct connection *con; + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE; i++) { - hlist_for_each_entry_safe(con, n, &connection_hash[i], list) + hlist_for_each_entry_rcu(con, &connection_hash[i], list) conn_func(con); } -} - -static struct connection *nodeid2con(int nodeid, gfp_t allocation) -{ - struct connection *con; - - mutex_lock(&connections_lock); - con = __nodeid2con(nodeid, allocation); - mutex_unlock(&connections_lock); - - return con; + srcu_read_unlock(&connections_srcu, idx); } static struct dlm_node_addr *find_node_addr(int nodeid) @@ -792,12 +791,9 @@ static int accept_from_sock(struct connection *con) struct connection *newcon; struct connection *addcon; - mutex_lock(&connections_lock); if (!dlm_allow_conn) { - mutex_unlock(&connections_lock); return -1; } - mutex_unlock(&connections_lock); mutex_lock_nested(&con->sock_mutex, 0); @@ -847,7 +843,7 @@ static int accept_from_sock(struct connection *con) struct connection *othercon = newcon->othercon; if (!othercon) { - othercon = kmem_cache_zalloc(con_cache, GFP_NOFS); + othercon = kzalloc(sizeof(*othercon), GFP_NOFS); if (!othercon) { log_print("failed to allocate incoming socket"); mutex_unlock(&newcon->sock_mutex); @@ -1612,16 +1608,17 @@ static void free_conn(struct connection *con) { close_connection(con, true, true, true); if (con->othercon) - kmem_cache_free(con_cache, con->othercon); - hlist_del(&con->list); - kmem_cache_free(con_cache, con); + kfree_rcu(con->othercon, rcu); + spin_lock(&connections_lock); + hlist_del_rcu(&con->list); + spin_unlock(&connections_lock); + kfree_rcu(con, rcu); } static void work_flush(void) { - int ok; + int ok, idx; int i; - struct hlist_node *n; struct connection *con; do { @@ -1631,9 +1628,10 @@ static void work_flush(void) flush_workqueue(recv_workqueue); if (send_workqueue) flush_workqueue(send_workqueue); + idx = srcu_read_lock(&connections_srcu); for (i = 0; i < CONN_HASH_SIZE && ok; i++) { - hlist_for_each_entry_safe(con, n, - &connection_hash[i], list) { + hlist_for_each_entry_rcu(con, &connection_hash[i], + list) { ok &= test_bit(CF_READ_PENDING, &con->flags); ok &= test_bit(CF_WRITE_PENDING, &con->flags); if (con->othercon) { @@ -1644,6 +1642,7 @@ static void work_flush(void) } } } + srcu_read_unlock(&connections_srcu, idx); } while (!ok); } @@ -1652,9 +1651,7 @@ void dlm_lowcomms_stop(void) /* Set all the flags to prevent any socket activity. */ - mutex_lock(&connections_lock); dlm_allow_conn = 0; - mutex_unlock(&connections_lock); if (recv_workqueue) flush_workqueue(recv_workqueue); @@ -1666,8 +1663,6 @@ void dlm_lowcomms_stop(void) clean_writequeues(); foreach_conn(free_conn); work_stop(); - - kmem_cache_destroy(con_cache); } int dlm_lowcomms_start(void) @@ -1686,16 +1681,9 @@ int dlm_lowcomms_start(void) goto fail; } - error = -ENOMEM; - con_cache = kmem_cache_create("dlm_conn", sizeof(struct connection), - __alignof__(struct connection), 0, - NULL); - if (!con_cache) - goto fail; - error = work_start(); if (error) - goto fail_destroy; + goto fail; dlm_allow_conn = 1; @@ -1714,10 +1702,8 @@ int dlm_lowcomms_start(void) con = nodeid2con(0,0); if (con) { close_connection(con, false, true, true); - kmem_cache_free(con_cache, con); + kfree_rcu(con, rcu); } -fail_destroy: - kmem_cache_destroy(con_cache); fail: return error; } From 043697f030c5c7889682c82f08e05adeb613939a Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Aug 2020 15:02:50 -0400 Subject: [PATCH 003/271] fs: dlm: fix dlm_local_addr memory leak This patch fixes the following memory detected by kmemleak and umount gfs2 filesystem which removed the last lockspace: unreferenced object 0xffff9264f4f48f00 (size 128): comm "mount", pid 425, jiffies 4294690253 (age 48.159s) hex dump (first 32 bytes): 02 00 52 48 c0 a8 7a fb 00 00 00 00 00 00 00 00 ..RH..z......... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<0000000067a34940>] kmemdup+0x18/0x40 [<00000000c935f9ab>] init_local+0x4c/0xa0 [<00000000bbd286ef>] dlm_lowcomms_start+0x28/0x160 [<00000000a86625cb>] dlm_new_lockspace+0x7e/0xb80 [<000000008df6cd63>] gdlm_mount+0x1cc/0x5de [<00000000b67df8c7>] gfs2_lm_mount.constprop.0+0x1a3/0x1d3 [<000000006642ac5e>] gfs2_fill_super+0x717/0xba9 [<00000000d3ab7118>] get_tree_bdev+0x17f/0x280 [<000000001975926e>] gfs2_get_tree+0x21/0x90 [<00000000561ce1c4>] vfs_get_tree+0x28/0xc0 [<000000007fecaf63>] path_mount+0x434/0xc00 [<00000000636b9594>] __x64_sys_mount+0xe3/0x120 [<00000000cc478a33>] do_syscall_64+0x33/0x40 [<00000000ce9ccf01>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 9db7126de793..d0ece252a0d9 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1234,6 +1234,14 @@ static void init_local(void) } } +static void deinit_local(void) +{ + int i; + + for (i = 0; i < dlm_local_count; i++) + kfree(dlm_local_addr[i]); +} + /* Initialise SCTP socket and bind to all interfaces */ static int sctp_listen_for_all(void) { @@ -1663,6 +1671,7 @@ void dlm_lowcomms_stop(void) clean_writequeues(); foreach_conn(free_conn); work_stop(); + deinit_local(); } int dlm_lowcomms_start(void) From 3d2825c8c6105b0f36f3ff72760799fa2e71420e Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Aug 2020 15:02:51 -0400 Subject: [PATCH 004/271] fs: dlm: fix configfs memory leak This patch fixes the following memory detected by kmemleak and umount gfs2 filesystem which removed the last lockspace: unreferenced object 0xffff9264f482f600 (size 192): comm "dlm_controld", pid 325, jiffies 4294690276 (age 48.136s) hex dump (first 32 bytes): 00 00 00 00 00 00 00 00 6e 6f 64 65 73 00 00 00 ........nodes... 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 ................ backtrace: [<00000000060481d7>] make_space+0x41/0x130 [<000000008d905d46>] configfs_mkdir+0x1a2/0x5f0 [<00000000729502cf>] vfs_mkdir+0x155/0x210 [<000000000369bcf1>] do_mkdirat+0x6d/0x110 [<00000000cc478a33>] do_syscall_64+0x33/0x40 [<00000000ce9ccf01>] entry_SYSCALL_64_after_hwframe+0x44/0xa9 The patch just remembers the "nodes" entry pointer in space as I think it's created as subdirectory when parent "spaces" is created. In function drop_space() we will lost the pointer reference to nds because configfs_remove_default_groups(). However as this subdirectory is always available when "spaces" exists it will just be freed when "spaces" will be freed. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 47f0b98b707f..f33a7e4ae917 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -221,6 +221,7 @@ struct dlm_space { struct list_head members; struct mutex members_lock; int members_count; + struct dlm_nodes *nds; }; struct dlm_comms { @@ -430,6 +431,7 @@ static struct config_group *make_space(struct config_group *g, const char *name) INIT_LIST_HEAD(&sp->members); mutex_init(&sp->members_lock); sp->members_count = 0; + sp->nds = nds; return &sp->group; fail: @@ -451,6 +453,7 @@ static void drop_space(struct config_group *g, struct config_item *i) static void release_space(struct config_item *i) { struct dlm_space *sp = config_item_to_space(i); + kfree(sp->nds); kfree(sp); } From 0de984323ac56aa420e6f28d7ce205a293fdb649 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Aug 2020 15:02:52 -0400 Subject: [PATCH 005/271] fs: dlm: move free writequeue into con free This patch just move the free of struct connection member writequeue into the functionality when struct connection will be freed instead of doing two iterations. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 9 +-------- 1 file changed, 1 insertion(+), 8 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d0ece252a0d9..04afc7178afb 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1550,13 +1550,6 @@ static void process_send_sockets(struct work_struct *work) send_to_sock(con); } - -/* Discard all entries on the write queues */ -static void clean_writequeues(void) -{ - foreach_conn(clean_one_writequeue); -} - static void work_stop(void) { if (recv_workqueue) @@ -1620,6 +1613,7 @@ static void free_conn(struct connection *con) spin_lock(&connections_lock); hlist_del_rcu(&con->list); spin_unlock(&connections_lock); + clean_one_writequeue(con); kfree_rcu(con, rcu); } @@ -1668,7 +1662,6 @@ void dlm_lowcomms_stop(void) foreach_conn(shutdown_conn); work_flush(); - clean_writequeues(); foreach_conn(free_conn); work_stop(); deinit_local(); From 948c47e9bcb6a42229cb1da1cc350c887a33ebb8 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Aug 2020 15:02:53 -0400 Subject: [PATCH 006/271] fs: dlm: handle possible othercon writequeues This patch adds free of possible other writequeue entries in othercon member of struct connection. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 04afc7178afb..794216eb728c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1608,11 +1608,13 @@ static void shutdown_conn(struct connection *con) static void free_conn(struct connection *con) { close_connection(con, true, true, true); - if (con->othercon) - kfree_rcu(con->othercon, rcu); spin_lock(&connections_lock); hlist_del_rcu(&con->list); spin_unlock(&connections_lock); + if (con->othercon) { + clean_one_writequeue(con->othercon); + kfree_rcu(con->othercon, rcu); + } clean_one_writequeue(con); kfree_rcu(con, rcu); } From 7ae0451e2e6c29ff9fc17754b1129d9491177634 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 27 Aug 2020 15:02:54 -0400 Subject: [PATCH 007/271] fs: dlm: use free_con to free connection This patch use free_con() functionality to free the listen connection if listen fails. It also fixes an issue that a freed resource is still part of the connection_hash as hlist_del() is not called in this case. The only difference is that free_con() handles othercon as well, but this is never been set for the listen connection. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 794216eb728c..1bf1808bfa6b 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -1704,10 +1704,8 @@ int dlm_lowcomms_start(void) fail_unlisten: dlm_allow_conn = 0; con = nodeid2con(0,0); - if (con) { - close_connection(con, false, true, true); - kfree_rcu(con, rcu); - } + if (con) + free_conn(con); fail: return error; } From 9481b45ca9b951e0ad406c9d1f6f90ad6ff56a7d Mon Sep 17 00:00:00 2001 From: "Alexander A. Klimov" Date: Sat, 18 Jul 2020 11:06:14 +0200 Subject: [PATCH 008/271] mmc: sdhci_am654: Replace HTTP links with HTTPS ones Rationale: Reduces attack surface on kernel devs opening the links for MITM as HTTPS traffic is much harder to manipulate. Deterministic algorithm: For each file: If not .svg: For each line: If doesn't contain `\bxmlns\b`: For each link, `\bhttp://[^# \t\r\n]*(?:\w|/)`: If neither `\bgnu\.org/license`, nor `\bmozilla\.org/MPL\b`: If both the HTTP and HTTPS versions return 200 OK and serve the same content: Replace HTTP with HTTPS. Signed-off-by: Alexander A. Klimov Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20200718090614.88946-1-grandmaster@al2klimov.de Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index f9d24af12396..0f3ed716d3d2 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -2,7 +2,7 @@ /* * sdhci_am654.c - SDHCI driver for TI's AM654 SOCs * - * Copyright (C) 2018 Texas Instruments Incorporated - http://www.ti.com + * Copyright (C) 2018 Texas Instruments Incorporated - https://www.ti.com * */ #include From d99654f62362e08fb203bb9f5b6ffbd2c00ad8b6 Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Sun, 2 Aug 2020 12:31:14 +0530 Subject: [PATCH 009/271] dt-bindings: mmc: sdhci-am654: Document bindings for the host controllers on TI's J7200 devices Add binding documentation for mmc host controllers present on TI's J7200 SOC Signed-off-by: Faiz Abbas Acked-by: Rob Herring Link: https://lore.kernel.org/r/20200802070114.9624-1-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/sdhci-am654.txt | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt index 6d202f4d9249..b49cbfdd679f 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt +++ b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt @@ -12,6 +12,8 @@ Required Properties: "ti,am654-sdhci-5.1": SDHCI on AM654 device. "ti,j721e-sdhci-8bit": 8 bit SDHCI on J721E device. "ti,j721e-sdhci-4bit": 4 bit SDHCI on J721E device. + "ti,j7200-sdhci-8bit": 8 bit SDHCI on J7200 device. + "ti,j7200-sdhci-4bit": 4 bit SDHCI on J7200 device. - reg: Must be two entries. - The first should be the sdhci register space - The second should the subsystem/phy register space @@ -33,7 +35,9 @@ Required Properties: These bindings must be provided otherwise the driver will disable the corresponding speed mode (i.e. all nodes must provide at least -legacy) -Optional Properties (Required for ti,am654-sdhci-5.1 and ti,j721e-sdhci-8bit): +Optional Properties (Required for ti,am654-sdhci-5.1, + ti,j721e-sdhci-8bit, + ti,j7200-sdhci-8bit): - ti,trm-icp: DLL trim select - ti,driver-strength-ohm: driver strength in ohms. Valid values are 33, 40, 50, 66 and 100 ohms. From bb82d3b57c35eb91918964954e6c8efcc1018ce3 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Tue, 4 Aug 2020 17:13:45 +0200 Subject: [PATCH 010/271] mmc: s3cmci: remove empty kerneldoc comment Remove empty comment to fix W=1 compile warning: drivers/mmc/host/s3cmci.c:400: warning: Cannot understand * on line 400 - I thought it was a doc line Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200804151345.28005-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/s3cmci.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index 444b2769ae2c..b5df948f8155 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -396,9 +396,6 @@ static void s3cmci_enable_irq(struct s3cmci_host *host, bool more) local_irq_restore(flags); } -/** - * - */ static void s3cmci_disable_irq(struct s3cmci_host *host, bool transfer) { unsigned long flags; From ce54fb38fa4291af8b39c70b114bdd30aaf0fd59 Mon Sep 17 00:00:00 2001 From: Guo Ren Date: Sat, 8 Aug 2020 11:45:02 +0000 Subject: [PATCH 011/271] mmc: Kconfig: Add RISCV and CSKY for MMC_DW Synopsys DesignWare MMC controller could be used in RISC-V and C-SKY architectures. Signed-off-by: Guo Ren Cc: Ulf Hansson Link: https://lore.kernel.org/r/1596887102-9743-1-git-send-email-guoren@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 9c89a5b780e8..10adaf91faaa 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -777,7 +777,7 @@ config MMC_CAVIUM_THUNDERX config MMC_DW tristate "Synopsys DesignWare Memory Card Interface" - depends on ARC || ARM || ARM64 || MIPS || COMPILE_TEST + depends on ARC || ARM || ARM64 || MIPS || RISCV || CSKY || COMPILE_TEST help This selects support for the Synopsys DesignWare Mobile Storage IP block, this provides host support for SD and MMC interfaces, in both From 9e1adca2019b8bff3da61baef7b42042e6aa9b01 Mon Sep 17 00:00:00 2001 From: Lad Prabhakar Date: Sun, 9 Aug 2020 20:20:01 +0100 Subject: [PATCH 012/271] dt-bindings: mmc: renesas,sdhi: Add r8a774e1 support Document SDHI controller for RZ/G2H (R8A774E1) SoC, which is compatible with R-Car Gen3 SoC family. Signed-off-by: Lad Prabhakar Reviewed-by: Marian-Cristian Rotariu Reviewed-by: Geert Uytterhoeven Acked-by: Rob Herring Link: https://lore.kernel.org/r/20200809192001.19156-1-prabhakar.mahadev-lad.rj@bp.renesas.com Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml | 1 + 1 file changed, 1 insertion(+) diff --git a/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml b/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml index b4c3fd40caeb..6bbf29b5c239 100644 --- a/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml +++ b/Documentation/devicetree/bindings/mmc/renesas,sdhi.yaml @@ -50,6 +50,7 @@ properties: - renesas,sdhi-r8a774a1 # RZ/G2M - renesas,sdhi-r8a774b1 # RZ/G2N - renesas,sdhi-r8a774c0 # RZ/G2E + - renesas,sdhi-r8a774e1 # RZ/G2H - renesas,sdhi-r8a7795 # R-Car H3 - renesas,sdhi-r8a7796 # R-Car M3-W - renesas,sdhi-r8a77961 # R-Car M3-W+ From 309de450d78e9e83036f8b696596d8e7aa078f0d Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Mon, 10 Aug 2020 09:02:47 +0100 Subject: [PATCH 013/271] ms_block: fix spelling mistake "doesn'" -> "doesn't" There is a spelling mistake in a debug message. Fix it. Signed-off-by: Colin Ian King Link: https://lore.kernel.org/r/20200810080247.47024-1-colin.king@canonical.com Signed-off-by: Ulf Hansson --- drivers/memstick/core/ms_block.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/memstick/core/ms_block.c b/drivers/memstick/core/ms_block.c index 178954228631..8004dd64d09a 100644 --- a/drivers/memstick/core/ms_block.c +++ b/drivers/memstick/core/ms_block.c @@ -1223,7 +1223,7 @@ static int msb_read_boot_blocks(struct msb_data *msb) } if (be16_to_cpu(page->header.block_id) != MS_BLOCK_BOOT_ID) { - dbg("the pba at %d doesn' contain boot block ID", pba); + dbg("the pba at %d doesn't contain boot block ID", pba); continue; } From 0ac4f496a2d46d5fe320d7034c792df4a5907c46 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Tue, 11 Aug 2020 16:37:37 +0800 Subject: [PATCH 014/271] mmc: sdhci-esdhc-imx: Reset before sending tuning command for manual tuning According to IC suggestion, everytime before sending the tuning command, need to reset the usdhc, so to reset the tuning circuit, to let every tuning command work well for the manual tuning method. For standard tuning method, IC already add the reset operation in the hardware logic. Signed-off-by: Haibo Chen Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/1597135057-22272-1-git-send-email-haibo.chen@nxp.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-esdhc-imx.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index d738907a622f..0be334759c82 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -987,10 +987,20 @@ static int usdhc_execute_tuning(struct mmc_host *mmc, u32 opcode) static void esdhc_prepare_tuning(struct sdhci_host *host, u32 val) { u32 reg; + u8 sw_rst; + int ret; /* FIXME: delay a bit for card to be ready for next tuning due to errors */ mdelay(1); + /* IC suggest to reset USDHC before every tuning command */ + esdhc_clrset_le(host, 0xff, SDHCI_RESET_ALL, SDHCI_SOFTWARE_RESET); + ret = readb_poll_timeout(host->ioaddr + SDHCI_SOFTWARE_RESET, sw_rst, + !(sw_rst & SDHCI_RESET_ALL), 10, 100); + if (ret == -ETIMEDOUT) + dev_warn(mmc_dev(host->mmc), + "warning! RESET_ALL never complete before sending tuning command\n"); + reg = readl(host->ioaddr + ESDHC_MIX_CTRL); reg |= ESDHC_MIX_CTRL_EXE_TUNE | ESDHC_MIX_CTRL_SMPCLK_SEL | ESDHC_MIX_CTRL_FBCLK_SEL; From 8ebe2607965d3e2dc02029e8c7dd35fbe508ffd0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 27 Jul 2020 15:38:34 +0200 Subject: [PATCH 015/271] mmc: sdio: Check for CISTPL_VERS_1 buffer size MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Before parsing CISTPL_VERS_1 structure check that its size is at least two bytes to prevent buffer overflow. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200727133837.19086-2-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/sdio_cis.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index e0655278c5c3..3efaa9534a77 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -26,6 +26,9 @@ static int cistpl_vers_1(struct mmc_card *card, struct sdio_func *func, unsigned i, nr_strings; char **buffer, *string; + if (size < 2) + return 0; + /* Find all null-terminated (including zero length) strings in the TPLLV1_INFO field. Trailing garbage is ignored. */ buf += 2; From 78366e9cbd7892ac3d321e62c4be7a7fe79a69ed Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 27 Jul 2020 15:38:35 +0200 Subject: [PATCH 016/271] mmc: sdio: Parse CISTPL_VERS_1 major and minor revision numbers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit They should indicate compliance of standard. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200727133837.19086-3-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/sdio_cis.c | 8 ++++++++ include/linux/mmc/card.h | 2 ++ include/linux/mmc/sdio_func.h | 2 ++ 3 files changed, 12 insertions(+) diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index 3efaa9534a77..44bea5e4aeda 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -23,12 +23,16 @@ static int cistpl_vers_1(struct mmc_card *card, struct sdio_func *func, const unsigned char *buf, unsigned size) { + u8 major_rev, minor_rev; unsigned i, nr_strings; char **buffer, *string; if (size < 2) return 0; + major_rev = buf[0]; + minor_rev = buf[1]; + /* Find all null-terminated (including zero length) strings in the TPLLV1_INFO field. Trailing garbage is ignored. */ buf += 2; @@ -60,9 +64,13 @@ static int cistpl_vers_1(struct mmc_card *card, struct sdio_func *func, } if (func) { + func->major_rev = major_rev; + func->minor_rev = minor_rev; func->num_info = nr_strings; func->info = (const char**)buffer; } else { + card->major_rev = major_rev; + card->minor_rev = minor_rev; card->num_info = nr_strings; card->info = (const char**)buffer; } diff --git a/include/linux/mmc/card.h b/include/linux/mmc/card.h index 7d46411ffaa2..42df06c6b19c 100644 --- a/include/linux/mmc/card.h +++ b/include/linux/mmc/card.h @@ -297,6 +297,8 @@ struct mmc_card { struct sdio_cis cis; /* common tuple info */ struct sdio_func *sdio_func[SDIO_MAX_FUNCS]; /* SDIO functions (devices) */ struct sdio_func *sdio_single_irq; /* SDIO function when only one IRQ active */ + u8 major_rev; /* major revision number */ + u8 minor_rev; /* minor revision number */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ struct sdio_func_tuple *tuples; /* unknown common tuples */ diff --git a/include/linux/mmc/sdio_func.h b/include/linux/mmc/sdio_func.h index fa2aaab5e57a..478855b8e406 100644 --- a/include/linux/mmc/sdio_func.h +++ b/include/linux/mmc/sdio_func.h @@ -51,6 +51,8 @@ struct sdio_func { u8 *tmpbuf; /* DMA:able scratch buffer */ + u8 major_rev; /* major revision number */ + u8 minor_rev; /* minor revision number */ unsigned num_info; /* number of info strings */ const char **info; /* info strings */ From b91ec1dc5c4afb29f08afe85cdd347306044d318 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 27 Jul 2020 15:38:36 +0200 Subject: [PATCH 017/271] mmc: sdio: Extend sdio_config_attr macro and use it also for modalias MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This simplify code for generating sdio config attributes and allows easily define new sdio attributes. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200727133837.19086-4-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/sdio_bus.c | 20 ++++++-------------- 1 file changed, 6 insertions(+), 14 deletions(-) diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 3cc928282af7..2384829c8fb2 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -28,29 +28,21 @@ #define to_sdio_driver(d) container_of(d, struct sdio_driver, drv) /* show configuration fields */ -#define sdio_config_attr(field, format_string) \ +#define sdio_config_attr(field, format_string, args...) \ static ssize_t \ field##_show(struct device *dev, struct device_attribute *attr, char *buf) \ { \ struct sdio_func *func; \ \ func = dev_to_sdio_func (dev); \ - return sprintf (buf, format_string, func->field); \ + return sprintf(buf, format_string, args); \ } \ static DEVICE_ATTR_RO(field) -sdio_config_attr(class, "0x%02x\n"); -sdio_config_attr(vendor, "0x%04x\n"); -sdio_config_attr(device, "0x%04x\n"); - -static ssize_t modalias_show(struct device *dev, struct device_attribute *attr, char *buf) -{ - struct sdio_func *func = dev_to_sdio_func (dev); - - return sprintf(buf, "sdio:c%02Xv%04Xd%04X\n", - func->class, func->vendor, func->device); -} -static DEVICE_ATTR_RO(modalias); +sdio_config_attr(class, "0x%02x\n", func->class); +sdio_config_attr(vendor, "0x%04x\n", func->vendor); +sdio_config_attr(device, "0x%04x\n", func->device); +sdio_config_attr(modalias, "sdio:c%02Xv%04Xd%04X\n", func->class, func->vendor, func->device); static struct attribute *sdio_dev_attrs[] = { &dev_attr_class.attr, From b698f6abb7b3de2fd04ebbb86527ab1ea95405e0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Pali=20Roh=C3=A1r?= Date: Mon, 27 Jul 2020 15:38:37 +0200 Subject: [PATCH 018/271] mmc: sdio: Export SDIO revision and info strings to userspace MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For SDIO functions, SDIO cards and SD COMBO cards are exported revision number and info strings from CISTPL_VERS_1 structure. Revision number should indicate compliance of standard and info strings should contain product information in same format as product information for PCMCIA cards. Product information for PCMCIA cards should contain following strings in this order: Manufacturer, Product Name, Lot number, Programming Conditions. Note that not all SDIO cards export all those info strings in that order as described in PCMCIA Metaformat Specification. Signed-off-by: Pali Rohár Link: https://lore.kernel.org/r/20200727133837.19086-5-pali@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/bus.c | 12 ++++++++++++ drivers/mmc/core/sd.c | 36 +++++++++++++++++++++++++++++++++--- drivers/mmc/core/sdio.c | 24 ++++++++++++++++++++++++ drivers/mmc/core/sdio_bus.c | 34 ++++++++++++++++++++++++++++++++++ 4 files changed, 103 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/core/bus.c b/drivers/mmc/core/bus.c index 70207f11a654..c2e70b757dd1 100644 --- a/drivers/mmc/core/bus.c +++ b/drivers/mmc/core/bus.c @@ -68,6 +68,7 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { struct mmc_card *card = mmc_dev_to_card(dev); const char *type; + unsigned int i; int retval = 0; switch (card->type) { @@ -98,6 +99,17 @@ mmc_bus_uevent(struct device *dev, struct kobj_uevent_env *env) card->cis.vendor, card->cis.device); if (retval) return retval; + + retval = add_uevent_var(env, "SDIO_REVISION=%u.%u", + card->major_rev, card->minor_rev); + if (retval) + return retval; + + for (i = 0; i < card->num_info; i++) { + retval = add_uevent_var(env, "SDIO_INFO%u=%s", i+1, card->info[i]); + if (retval) + return retval; + } } /* diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 5a2210c25aa7..429ca14fa7e1 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -709,10 +709,34 @@ static DEVICE_ATTR(dsr, S_IRUGO, mmc_dsr_show, NULL); MMC_DEV_ATTR(vendor, "0x%04x\n", card->cis.vendor); MMC_DEV_ATTR(device, "0x%04x\n", card->cis.device); +MMC_DEV_ATTR(revision, "%u.%u\n", card->major_rev, card->minor_rev); + +#define sdio_info_attr(num) \ +static ssize_t info##num##_show(struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + struct mmc_card *card = mmc_dev_to_card(dev); \ + \ + if (num > card->num_info) \ + return -ENODATA; \ + if (!card->info[num-1][0]) \ + return 0; \ + return sprintf(buf, "%s\n", card->info[num-1]); \ +} \ +static DEVICE_ATTR_RO(info##num) + +sdio_info_attr(1); +sdio_info_attr(2); +sdio_info_attr(3); +sdio_info_attr(4); static struct attribute *sd_std_attrs[] = { &dev_attr_vendor.attr, &dev_attr_device.attr, + &dev_attr_revision.attr, + &dev_attr_info1.attr, + &dev_attr_info2.attr, + &dev_attr_info3.attr, + &dev_attr_info4.attr, &dev_attr_cid.attr, &dev_attr_csd.attr, &dev_attr_scr.attr, @@ -738,9 +762,15 @@ static umode_t sd_std_is_visible(struct kobject *kobj, struct attribute *attr, struct device *dev = container_of(kobj, struct device, kobj); struct mmc_card *card = mmc_dev_to_card(dev); - /* CIS vendor and device ids are available only for Combo cards */ - if ((attr == &dev_attr_vendor.attr || attr == &dev_attr_device.attr) && - card->type != MMC_TYPE_SD_COMBO) + /* CIS vendor and device ids, revision and info string are available only for Combo cards */ + if ((attr == &dev_attr_vendor.attr || + attr == &dev_attr_device.attr || + attr == &dev_attr_revision.attr || + attr == &dev_attr_info1.attr || + attr == &dev_attr_info2.attr || + attr == &dev_attr_info3.attr || + attr == &dev_attr_info4.attr + ) && card->type != MMC_TYPE_SD_COMBO) return 0; return attr->mode; diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index 7b40553d3934..694a212cbe25 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -29,12 +29,36 @@ MMC_DEV_ATTR(vendor, "0x%04x\n", card->cis.vendor); MMC_DEV_ATTR(device, "0x%04x\n", card->cis.device); +MMC_DEV_ATTR(revision, "%u.%u\n", card->major_rev, card->minor_rev); MMC_DEV_ATTR(ocr, "0x%08x\n", card->ocr); MMC_DEV_ATTR(rca, "0x%04x\n", card->rca); +#define sdio_info_attr(num) \ +static ssize_t info##num##_show(struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + struct mmc_card *card = mmc_dev_to_card(dev); \ + \ + if (num > card->num_info) \ + return -ENODATA; \ + if (!card->info[num-1][0]) \ + return 0; \ + return sprintf(buf, "%s\n", card->info[num-1]); \ +} \ +static DEVICE_ATTR_RO(info##num) + +sdio_info_attr(1); +sdio_info_attr(2); +sdio_info_attr(3); +sdio_info_attr(4); + static struct attribute *sdio_std_attrs[] = { &dev_attr_vendor.attr, &dev_attr_device.attr, + &dev_attr_revision.attr, + &dev_attr_info1.attr, + &dev_attr_info2.attr, + &dev_attr_info3.attr, + &dev_attr_info4.attr, &dev_attr_ocr.attr, &dev_attr_rca.attr, NULL, diff --git a/drivers/mmc/core/sdio_bus.c b/drivers/mmc/core/sdio_bus.c index 2384829c8fb2..3d709029e07c 100644 --- a/drivers/mmc/core/sdio_bus.c +++ b/drivers/mmc/core/sdio_bus.c @@ -42,12 +42,36 @@ static DEVICE_ATTR_RO(field) sdio_config_attr(class, "0x%02x\n", func->class); sdio_config_attr(vendor, "0x%04x\n", func->vendor); sdio_config_attr(device, "0x%04x\n", func->device); +sdio_config_attr(revision, "%u.%u\n", func->major_rev, func->minor_rev); sdio_config_attr(modalias, "sdio:c%02Xv%04Xd%04X\n", func->class, func->vendor, func->device); +#define sdio_info_attr(num) \ +static ssize_t info##num##_show(struct device *dev, struct device_attribute *attr, char *buf) \ +{ \ + struct sdio_func *func = dev_to_sdio_func(dev); \ + \ + if (num > func->num_info) \ + return -ENODATA; \ + if (!func->info[num-1][0]) \ + return 0; \ + return sprintf(buf, "%s\n", func->info[num-1]); \ +} \ +static DEVICE_ATTR_RO(info##num) + +sdio_info_attr(1); +sdio_info_attr(2); +sdio_info_attr(3); +sdio_info_attr(4); + static struct attribute *sdio_dev_attrs[] = { &dev_attr_class.attr, &dev_attr_vendor.attr, &dev_attr_device.attr, + &dev_attr_revision.attr, + &dev_attr_info1.attr, + &dev_attr_info2.attr, + &dev_attr_info3.attr, + &dev_attr_info4.attr, &dev_attr_modalias.attr, NULL, }; @@ -98,6 +122,7 @@ static int sdio_bus_uevent(struct device *dev, struct kobj_uevent_env *env) { struct sdio_func *func = dev_to_sdio_func(dev); + unsigned int i; if (add_uevent_var(env, "SDIO_CLASS=%02X", func->class)) @@ -107,6 +132,15 @@ sdio_bus_uevent(struct device *dev, struct kobj_uevent_env *env) "SDIO_ID=%04X:%04X", func->vendor, func->device)) return -ENOMEM; + if (add_uevent_var(env, + "SDIO_REVISION=%u.%u", func->major_rev, func->minor_rev)) + return -ENOMEM; + + for (i = 0; i < func->num_info; i++) { + if (add_uevent_var(env, "SDIO_INFO%u=%s", i+1, func->info[i])) + return -ENOMEM; + } + if (add_uevent_var(env, "MODALIAS=sdio:c%02Xv%04Xd%04X", func->class, func->vendor, func->device)) From 81e41be92ad43b268910dcae78c5a72f79f3d467 Mon Sep 17 00:00:00 2001 From: Tobias Schramm Date: Fri, 14 Aug 2020 20:50:11 +0200 Subject: [PATCH 019/271] mmc: mmc_spi: fix timeout calculation Previously the cycle timeout was converted to a microsecond value but then incorrectly treated as a nanosecond timeout. This patch changes the code to convert both the nanosecond timeout and the cycle timeout to a microsecond value and use that directly. Signed-off-by: Tobias Schramm Link: https://lore.kernel.org/r/20200814185011.3252020-1-t.schramm@manjaro.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/mmc_spi.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/mmc_spi.c b/drivers/mmc/host/mmc_spi.c index 39bb1e30c2d7..f85e0ad896a9 100644 --- a/drivers/mmc/host/mmc_spi.c +++ b/drivers/mmc/host/mmc_spi.c @@ -882,9 +882,9 @@ mmc_spi_data_do(struct mmc_spi_host *host, struct mmc_command *cmd, else clock_rate = spi->max_speed_hz; - timeout = data->timeout_ns + + timeout = data->timeout_ns / 1000 + data->timeout_clks * 1000000 / clock_rate; - timeout = usecs_to_jiffies((unsigned int)(timeout / 1000)) + 1; + timeout = usecs_to_jiffies((unsigned int)timeout) + 1; /* Handle scatterlist segments one at a time, with synch for * each 512-byte block From c92a6af6860c37dfa411049fdac45d225f0b424c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 17 Aug 2020 13:58:38 +0200 Subject: [PATCH 020/271] mmc: test: remove ambiguity in test description MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When reading the test description, I thought a correction of the xfer_size was tested, which is not the case. It is tested that the xfer_size is correct. Use 'proper xfer_size' to remove this ambiguity. Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/20200817115838.2981-1-wsa@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc_test.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/core/mmc_test.c b/drivers/mmc/core/mmc_test.c index c21b3cb71775..152e7525ed33 100644 --- a/drivers/mmc/core/mmc_test.c +++ b/drivers/mmc/core/mmc_test.c @@ -2669,22 +2669,22 @@ static const struct mmc_test_case mmc_test_cases[] = { }, { - .name = "Correct xfer_size at write (start failure)", + .name = "Proper xfer_size at write (start failure)", .run = mmc_test_xfersize_write, }, { - .name = "Correct xfer_size at read (start failure)", + .name = "Proper xfer_size at read (start failure)", .run = mmc_test_xfersize_read, }, { - .name = "Correct xfer_size at write (midway failure)", + .name = "Proper xfer_size at write (midway failure)", .run = mmc_test_multi_xfersize_write, }, { - .name = "Correct xfer_size at read (midway failure)", + .name = "Proper xfer_size at read (midway failure)", .run = mmc_test_multi_xfersize_read, }, From 46f4a69ec8ed6ab9f6a6172afe50df792c8bc1b6 Mon Sep 17 00:00:00 2001 From: Adrian Hunter Date: Tue, 18 Aug 2020 13:45:08 +0300 Subject: [PATCH 021/271] mmc: sdhci: Add LTR support for some Intel BYT based controllers Some Intel BYT based host controllers support the setting of latency tolerance. Accordingly, implement the PM QoS ->set_latency_tolerance() callback. The raw register values are also exposed via debugfs. Intel EHL controllers require this support. Signed-off-by: Adrian Hunter Fixes: cb3a7d4a0aec4e ("mmc: sdhci-pci: Add support for Intel EHL") Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200818104508.7149-1-adrian.hunter@intel.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-core.c | 154 ++++++++++++++++++++++++++++++ 1 file changed, 154 insertions(+) diff --git a/drivers/mmc/host/sdhci-pci-core.c b/drivers/mmc/host/sdhci-pci-core.c index af413805bbf1..d0c8d39d5dbd 100644 --- a/drivers/mmc/host/sdhci-pci-core.c +++ b/drivers/mmc/host/sdhci-pci-core.c @@ -24,6 +24,8 @@ #include #include #include +#include +#include #include #include #include @@ -516,6 +518,8 @@ struct intel_host { bool rpm_retune_ok; u32 glk_rx_ctrl1; u32 glk_tun_val; + u32 active_ltr; + u32 idle_ltr; }; static const guid_t intel_dsm_guid = @@ -760,6 +764,108 @@ static int intel_execute_tuning(struct mmc_host *mmc, u32 opcode) return 0; } +#define INTEL_ACTIVELTR 0x804 +#define INTEL_IDLELTR 0x808 + +#define INTEL_LTR_REQ BIT(15) +#define INTEL_LTR_SCALE_MASK GENMASK(11, 10) +#define INTEL_LTR_SCALE_1US (2 << 10) +#define INTEL_LTR_SCALE_32US (3 << 10) +#define INTEL_LTR_VALUE_MASK GENMASK(9, 0) + +static void intel_cache_ltr(struct sdhci_pci_slot *slot) +{ + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct sdhci_host *host = slot->host; + + intel_host->active_ltr = readl(host->ioaddr + INTEL_ACTIVELTR); + intel_host->idle_ltr = readl(host->ioaddr + INTEL_IDLELTR); +} + +static void intel_ltr_set(struct device *dev, s32 val) +{ + struct sdhci_pci_chip *chip = dev_get_drvdata(dev); + struct sdhci_pci_slot *slot = chip->slots[0]; + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct sdhci_host *host = slot->host; + u32 ltr; + + pm_runtime_get_sync(dev); + + /* + * Program latency tolerance (LTR) accordingly what has been asked + * by the PM QoS layer or disable it in case we were passed + * negative value or PM_QOS_LATENCY_ANY. + */ + ltr = readl(host->ioaddr + INTEL_ACTIVELTR); + + if (val == PM_QOS_LATENCY_ANY || val < 0) { + ltr &= ~INTEL_LTR_REQ; + } else { + ltr |= INTEL_LTR_REQ; + ltr &= ~INTEL_LTR_SCALE_MASK; + ltr &= ~INTEL_LTR_VALUE_MASK; + + if (val > INTEL_LTR_VALUE_MASK) { + val >>= 5; + if (val > INTEL_LTR_VALUE_MASK) + val = INTEL_LTR_VALUE_MASK; + ltr |= INTEL_LTR_SCALE_32US | val; + } else { + ltr |= INTEL_LTR_SCALE_1US | val; + } + } + + if (ltr == intel_host->active_ltr) + goto out; + + writel(ltr, host->ioaddr + INTEL_ACTIVELTR); + writel(ltr, host->ioaddr + INTEL_IDLELTR); + + /* Cache the values into lpss structure */ + intel_cache_ltr(slot); +out: + pm_runtime_put_autosuspend(dev); +} + +static bool intel_use_ltr(struct sdhci_pci_chip *chip) +{ + switch (chip->pdev->device) { + case PCI_DEVICE_ID_INTEL_BYT_EMMC: + case PCI_DEVICE_ID_INTEL_BYT_EMMC2: + case PCI_DEVICE_ID_INTEL_BYT_SDIO: + case PCI_DEVICE_ID_INTEL_BYT_SD: + case PCI_DEVICE_ID_INTEL_BSW_EMMC: + case PCI_DEVICE_ID_INTEL_BSW_SDIO: + case PCI_DEVICE_ID_INTEL_BSW_SD: + return false; + default: + return true; + } +} + +static void intel_ltr_expose(struct sdhci_pci_chip *chip) +{ + struct device *dev = &chip->pdev->dev; + + if (!intel_use_ltr(chip)) + return; + + dev->power.set_latency_tolerance = intel_ltr_set; + dev_pm_qos_expose_latency_tolerance(dev); +} + +static void intel_ltr_hide(struct sdhci_pci_chip *chip) +{ + struct device *dev = &chip->pdev->dev; + + if (!intel_use_ltr(chip)) + return; + + dev_pm_qos_hide_latency_tolerance(dev); + dev->power.set_latency_tolerance = NULL; +} + static void byt_probe_slot(struct sdhci_pci_slot *slot) { struct mmc_host_ops *ops = &slot->host->mmc_host_ops; @@ -774,6 +880,43 @@ static void byt_probe_slot(struct sdhci_pci_slot *slot) ops->start_signal_voltage_switch = intel_start_signal_voltage_switch; device_property_read_u32(dev, "max-frequency", &mmc->f_max); + + if (!mmc->slotno) { + slot->chip->slots[mmc->slotno] = slot; + intel_ltr_expose(slot->chip); + } +} + +static void byt_add_debugfs(struct sdhci_pci_slot *slot) +{ + struct intel_host *intel_host = sdhci_pci_priv(slot); + struct mmc_host *mmc = slot->host->mmc; + struct dentry *dir = mmc->debugfs_root; + + if (!intel_use_ltr(slot->chip)) + return; + + debugfs_create_x32("active_ltr", 0444, dir, &intel_host->active_ltr); + debugfs_create_x32("idle_ltr", 0444, dir, &intel_host->idle_ltr); + + intel_cache_ltr(slot); +} + +static int byt_add_host(struct sdhci_pci_slot *slot) +{ + int ret = sdhci_add_host(slot->host); + + if (!ret) + byt_add_debugfs(slot); + return ret; +} + +static void byt_remove_slot(struct sdhci_pci_slot *slot, int dead) +{ + struct mmc_host *mmc = slot->host->mmc; + + if (!mmc->slotno) + intel_ltr_hide(slot->chip); } static int byt_emmc_probe_slot(struct sdhci_pci_slot *slot) @@ -854,6 +997,8 @@ static int glk_emmc_add_host(struct sdhci_pci_slot *slot) if (ret) goto cleanup; + byt_add_debugfs(slot); + return 0; cleanup: @@ -1031,6 +1176,8 @@ static const struct sdhci_pci_fixes sdhci_intel_byt_emmc = { #endif .allow_runtime_pm = true, .probe_slot = byt_emmc_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .quirks = SDHCI_QUIRK_NO_ENDATTR_IN_NOPDESC | SDHCI_QUIRK_NO_LED, .quirks2 = SDHCI_QUIRK2_PRESET_VALUE_BROKEN | @@ -1044,6 +1191,7 @@ static const struct sdhci_pci_fixes sdhci_intel_glk_emmc = { .allow_runtime_pm = true, .probe_slot = glk_emmc_probe_slot, .add_host = glk_emmc_add_host, + .remove_slot = byt_remove_slot, #ifdef CONFIG_PM_SLEEP .suspend = sdhci_cqhci_suspend, .resume = sdhci_cqhci_resume, @@ -1074,6 +1222,8 @@ static const struct sdhci_pci_fixes sdhci_ni_byt_sdio = { SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .allow_runtime_pm = true, .probe_slot = ni_byt_sdio_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .ops = &sdhci_intel_byt_ops, .priv_size = sizeof(struct intel_host), }; @@ -1091,6 +1241,8 @@ static const struct sdhci_pci_fixes sdhci_intel_byt_sdio = { SDHCI_QUIRK2_PRESET_VALUE_BROKEN, .allow_runtime_pm = true, .probe_slot = byt_sdio_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .ops = &sdhci_intel_byt_ops, .priv_size = sizeof(struct intel_host), }; @@ -1110,6 +1262,8 @@ static const struct sdhci_pci_fixes sdhci_intel_byt_sd = { .allow_runtime_pm = true, .own_cd_for_runtime_pm = true, .probe_slot = byt_sd_probe_slot, + .add_host = byt_add_host, + .remove_slot = byt_remove_slot, .ops = &sdhci_intel_byt_ops, .priv_size = sizeof(struct intel_host), }; From 6932794192f63c5e105eade19fba26b8ef29ec79 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 21 Aug 2020 08:35:33 +0200 Subject: [PATCH 022/271] mmc: core: Improve documentation of MMC_CAP_HW_RESET MMC_CAP_HW_RESET means that the controller is capable of resetting the eMMC device via RST_n signal, not a reset of the controller. Two drivers got this wrong, so let's make it more clear. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20200821063533.3771-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- include/linux/mmc/host.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index c5b6e97cb21a..799e23b0a23c 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -163,6 +163,7 @@ struct mmc_host_ops { int (*select_drive_strength)(struct mmc_card *card, unsigned int max_dtr, int host_drv, int card_drv, int *drv_type); + /* Reset the eMMC card via RST_n */ void (*hw_reset)(struct mmc_host *host); void (*card_event)(struct mmc_host *host); @@ -346,7 +347,7 @@ struct mmc_host { #define MMC_CAP_CD_WAKE (1 << 28) /* Enable card detect wake */ #define MMC_CAP_CMD_DURING_TFR (1 << 29) /* Commands during data transfer */ #define MMC_CAP_CMD23 (1 << 30) /* CMD23 supported. */ -#define MMC_CAP_HW_RESET (1 << 31) /* Hardware reset */ +#define MMC_CAP_HW_RESET (1 << 31) /* Reset the eMMC card via RST_n */ u32 caps2; /* More host capabilities */ From 87d7ad089b318b4f319bf57f1daa64eb6d1d10ad Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Sat, 22 Aug 2020 11:45:28 +0530 Subject: [PATCH 023/271] mmc: via-sdmmc: Fix data race bug via_save_pcictrlreg() should be called with host->lock held as it writes to pm_pcictrl_reg, otherwise there can be a race condition between via_sd_suspend() and via_sdc_card_detect(). The same pattern is used in the function via_reset_pcictrl() as well, where via_save_pcictrlreg() is called with host->lock held. Found by Linux Driver Verification project (linuxtesting.org). Signed-off-by: Madhuparna Bhowmik Link: https://lore.kernel.org/r/20200822061528.7035-1-madhuparnabhowmik10@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/via-sdmmc.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/mmc/host/via-sdmmc.c b/drivers/mmc/host/via-sdmmc.c index 49dab9f42b6d..9b755ea0fa03 100644 --- a/drivers/mmc/host/via-sdmmc.c +++ b/drivers/mmc/host/via-sdmmc.c @@ -1257,11 +1257,14 @@ static void __maybe_unused via_init_sdc_pm(struct via_crdr_mmc_host *host) static int __maybe_unused via_sd_suspend(struct device *dev) { struct via_crdr_mmc_host *host; + unsigned long flags; host = dev_get_drvdata(dev); + spin_lock_irqsave(&host->lock, flags); via_save_pcictrlreg(host); via_save_sdcreg(host); + spin_unlock_irqrestore(&host->lock, flags); device_wakeup_enable(dev); From 061e4fbf6a64b6f870d95438932119a22a4b6321 Mon Sep 17 00:00:00 2001 From: Lars Povlsen Date: Tue, 25 Aug 2020 10:13:55 +0200 Subject: [PATCH 024/271] dt-bindings: mmc: Add Sparx5 SDHCI controller bindings The Sparx5 SDHCI controller is based on the Designware controller IP. Signed-off-by: Lars Povlsen Link: https://lore.kernel.org/r/20200825081357.32354-2-lars.povlsen@microchip.com Link: https://lore.kernel.org/r/20200826130106.22889-1-lars.povlsen@microchip.com Signed-off-by: Ulf Hansson --- .../mmc/microchip,dw-sparx5-sdhci.yaml | 65 +++++++++++++++++++ 1 file changed, 65 insertions(+) create mode 100644 Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml diff --git a/Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml b/Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml new file mode 100644 index 000000000000..55883290543b --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/microchip,dw-sparx5-sdhci.yaml @@ -0,0 +1,65 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +%YAML 1.2 +--- +$id: http://devicetree.org/schemas/mmc/microchip,dw-sparx5-sdhci.yaml# +$schema: http://devicetree.org/meta-schemas/core.yaml# + +title: Microchip Sparx5 Mobile Storage Host Controller Binding + +allOf: + - $ref: "mmc-controller.yaml" + +maintainers: + - Lars Povlsen + +# Everything else is described in the common file +properties: + compatible: + const: microchip,dw-sparx5-sdhci + + reg: + maxItems: 1 + + interrupts: + maxItems: 1 + + clocks: + maxItems: 1 + description: + Handle to "core" clock for the sdhci controller. + + clock-names: + items: + - const: core + + microchip,clock-delay: + description: Delay clock to card to meet setup time requirements. + Each step increase by 1.25ns. + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 1 + maximum: 15 + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + +examples: + - | + #include + #include + sdhci0: mmc@600800000 { + compatible = "microchip,dw-sparx5-sdhci"; + reg = <0x00800000 0x1000>; + pinctrl-0 = <&emmc_pins>; + pinctrl-names = "default"; + clocks = <&clks CLK_ID_AUX1>; + clock-names = "core"; + assigned-clocks = <&clks CLK_ID_AUX1>; + assigned-clock-rates = <800000000>; + interrupts = ; + bus-width = <8>; + microchip,clock-delay = <10>; + }; From 43611afb6c178d4f07461fd4d4832e0da3762386 Mon Sep 17 00:00:00 2001 From: Lars Povlsen Date: Tue, 25 Aug 2020 10:13:56 +0200 Subject: [PATCH 025/271] mmc: sdhci-of-sparx5: Add Sparx5 SoC eMMC driver This adds the eMMC driver for the Sparx5 SoC. It is based upon the designware IP, but requires some extra initialization and quirks. Signed-off-by: Lars Povlsen Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/20200825081357.32354-3-lars.povlsen@microchip.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 12 ++ drivers/mmc/host/Makefile | 1 + drivers/mmc/host/sdhci-of-sparx5.c | 269 +++++++++++++++++++++++++++++ 3 files changed, 282 insertions(+) create mode 100644 drivers/mmc/host/sdhci-of-sparx5.c diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 10adaf91faaa..0ef1992ece3b 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -213,6 +213,18 @@ config MMC_SDHCI_OF_DWCMSHC If you have a controller with this interface, say Y or M here. If unsure, say N. +config MMC_SDHCI_OF_SPARX5 + tristate "SDHCI OF support for the MCHP Sparx5 SoC" + depends on MMC_SDHCI_PLTFM + depends on ARCH_SPARX5 + help + This selects the Secure Digital Host Controller Interface (SDHCI) + found in the MCHP Sparx5 SoC. + + If you have a Sparx5 SoC with this interface, say Y or M here. + + If unsure, say N. + config MMC_SDHCI_CADENCE tristate "SDHCI support for the Cadence SD/SDIO/eMMC controller" depends on MMC_SDHCI_PLTFM diff --git a/drivers/mmc/host/Makefile b/drivers/mmc/host/Makefile index 4d5bcb0144a0..451c25fc2c69 100644 --- a/drivers/mmc/host/Makefile +++ b/drivers/mmc/host/Makefile @@ -94,6 +94,7 @@ obj-$(CONFIG_MMC_SDHCI_OF_AT91) += sdhci-of-at91.o obj-$(CONFIG_MMC_SDHCI_OF_ESDHC) += sdhci-of-esdhc.o obj-$(CONFIG_MMC_SDHCI_OF_HLWD) += sdhci-of-hlwd.o obj-$(CONFIG_MMC_SDHCI_OF_DWCMSHC) += sdhci-of-dwcmshc.o +obj-$(CONFIG_MMC_SDHCI_OF_SPARX5) += sdhci-of-sparx5.o obj-$(CONFIG_MMC_SDHCI_BCM_KONA) += sdhci-bcm-kona.o obj-$(CONFIG_MMC_SDHCI_IPROC) += sdhci-iproc.o obj-$(CONFIG_MMC_SDHCI_MSM) += sdhci-msm.o diff --git a/drivers/mmc/host/sdhci-of-sparx5.c b/drivers/mmc/host/sdhci-of-sparx5.c new file mode 100644 index 000000000000..14b5dad6575a --- /dev/null +++ b/drivers/mmc/host/sdhci-of-sparx5.c @@ -0,0 +1,269 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * drivers/mmc/host/sdhci-of-sparx5.c + * + * MCHP Sparx5 SoC Secure Digital Host Controller Interface. + * + * Copyright (c) 2019 Microchip Inc. + * + * Author: Lars Povlsen + */ + +#include +#include +#include +#include +#include +#include +#include + +#include "sdhci-pltfm.h" + +#define CPU_REGS_GENERAL_CTRL (0x22 * 4) +#define MSHC_DLY_CC_MASK GENMASK(16, 13) +#define MSHC_DLY_CC_SHIFT 13 +#define MSHC_DLY_CC_MAX 15 + +#define CPU_REGS_PROC_CTRL (0x2C * 4) +#define ACP_CACHE_FORCE_ENA BIT(4) +#define ACP_AWCACHE BIT(3) +#define ACP_ARCACHE BIT(2) +#define ACP_CACHE_MASK (ACP_CACHE_FORCE_ENA|ACP_AWCACHE|ACP_ARCACHE) + +#define MSHC2_VERSION 0x500 /* Off 0x140, reg 0x0 */ +#define MSHC2_TYPE 0x504 /* Off 0x140, reg 0x1 */ +#define MSHC2_EMMC_CTRL 0x52c /* Off 0x140, reg 0xB */ +#define MSHC2_EMMC_CTRL_EMMC_RST_N BIT(2) +#define MSHC2_EMMC_CTRL_IS_EMMC BIT(0) + +struct sdhci_sparx5_data { + struct sdhci_host *host; + struct regmap *cpu_ctrl; + int delay_clock; +}; + +#define BOUNDARY_OK(addr, len) \ + ((addr | (SZ_128M - 1)) == ((addr + len - 1) | (SZ_128M - 1))) + +/* + * If DMA addr spans 128MB boundary, we split the DMA transfer into two + * so that each DMA transfer doesn't exceed the boundary. + */ +static void sdhci_sparx5_adma_write_desc(struct sdhci_host *host, void **desc, + dma_addr_t addr, int len, + unsigned int cmd) +{ + int tmplen, offset; + + if (likely(!len || BOUNDARY_OK(addr, len))) { + sdhci_adma_write_desc(host, desc, addr, len, cmd); + return; + } + + pr_debug("%s: write_desc: splitting dma len %d, offset 0x%0llx\n", + mmc_hostname(host->mmc), len, addr); + + offset = addr & (SZ_128M - 1); + tmplen = SZ_128M - offset; + sdhci_adma_write_desc(host, desc, addr, tmplen, cmd); + + addr += tmplen; + len -= tmplen; + sdhci_adma_write_desc(host, desc, addr, len, cmd); +} + +static void sparx5_set_cacheable(struct sdhci_host *host, u32 value) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_sparx5_data *sdhci_sparx5 = sdhci_pltfm_priv(pltfm_host); + + pr_debug("%s: Set Cacheable = 0x%x\n", mmc_hostname(host->mmc), value); + + /* Update ACP caching attributes in HW */ + regmap_update_bits(sdhci_sparx5->cpu_ctrl, + CPU_REGS_PROC_CTRL, ACP_CACHE_MASK, value); +} + +static void sparx5_set_delay(struct sdhci_host *host, u8 value) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_sparx5_data *sdhci_sparx5 = sdhci_pltfm_priv(pltfm_host); + + pr_debug("%s: Set DLY_CC = %u\n", mmc_hostname(host->mmc), value); + + /* Update DLY_CC in HW */ + regmap_update_bits(sdhci_sparx5->cpu_ctrl, + CPU_REGS_GENERAL_CTRL, + MSHC_DLY_CC_MASK, + (value << MSHC_DLY_CC_SHIFT)); +} + +static void sdhci_sparx5_set_emmc(struct sdhci_host *host) +{ + if (!mmc_card_is_removable(host->mmc)) { + u8 value; + + value = sdhci_readb(host, MSHC2_EMMC_CTRL); + if (!(value & MSHC2_EMMC_CTRL_IS_EMMC)) { + value |= MSHC2_EMMC_CTRL_IS_EMMC; + pr_debug("%s: Set EMMC_CTRL: 0x%08x\n", + mmc_hostname(host->mmc), value); + sdhci_writeb(host, value, MSHC2_EMMC_CTRL); + } + } +} + +static void sdhci_sparx5_reset_emmc(struct sdhci_host *host) +{ + u8 value; + + pr_debug("%s: Toggle EMMC_CTRL.EMMC_RST_N\n", mmc_hostname(host->mmc)); + value = sdhci_readb(host, MSHC2_EMMC_CTRL) & + ~MSHC2_EMMC_CTRL_EMMC_RST_N; + sdhci_writeb(host, value, MSHC2_EMMC_CTRL); + /* For eMMC, minimum is 1us but give it 10us for good measure */ + usleep_range(10, 20); + sdhci_writeb(host, value | MSHC2_EMMC_CTRL_EMMC_RST_N, + MSHC2_EMMC_CTRL); + /* For eMMC, minimum is 200us but give it 300us for good measure */ + usleep_range(300, 400); +} + +static void sdhci_sparx5_reset(struct sdhci_host *host, u8 mask) +{ + pr_debug("%s: *** RESET: mask %d\n", mmc_hostname(host->mmc), mask); + + sdhci_reset(host, mask); + + /* Be sure CARD_IS_EMMC stays set */ + sdhci_sparx5_set_emmc(host); +} + +static const struct sdhci_ops sdhci_sparx5_ops = { + .set_clock = sdhci_set_clock, + .set_bus_width = sdhci_set_bus_width, + .set_uhs_signaling = sdhci_set_uhs_signaling, + .get_max_clock = sdhci_pltfm_clk_get_max_clock, + .reset = sdhci_sparx5_reset, + .adma_write_desc = sdhci_sparx5_adma_write_desc, +}; + +static const struct sdhci_pltfm_data sdhci_sparx5_pdata = { + .quirks = 0, + .quirks2 = SDHCI_QUIRK2_HOST_NO_CMD23 | /* Controller issue */ + SDHCI_QUIRK2_NO_1_8_V, /* No sdr104, ddr50, etc */ + .ops = &sdhci_sparx5_ops, +}; + +static int sdhci_sparx5_probe(struct platform_device *pdev) +{ + int ret; + const char *syscon = "microchip,sparx5-cpu-syscon"; + struct sdhci_host *host; + struct sdhci_pltfm_host *pltfm_host; + struct sdhci_sparx5_data *sdhci_sparx5; + struct device_node *np = pdev->dev.of_node; + u32 value; + u32 extra; + + host = sdhci_pltfm_init(pdev, &sdhci_sparx5_pdata, + sizeof(*sdhci_sparx5)); + + if (IS_ERR(host)) + return PTR_ERR(host); + + /* + * extra adma table cnt for cross 128M boundary handling. + */ + extra = DIV_ROUND_UP_ULL(dma_get_required_mask(&pdev->dev), SZ_128M); + if (extra > SDHCI_MAX_SEGS) + extra = SDHCI_MAX_SEGS; + host->adma_table_cnt += extra; + + pltfm_host = sdhci_priv(host); + sdhci_sparx5 = sdhci_pltfm_priv(pltfm_host); + sdhci_sparx5->host = host; + + pltfm_host->clk = devm_clk_get(&pdev->dev, "core"); + if (IS_ERR(pltfm_host->clk)) { + ret = PTR_ERR(pltfm_host->clk); + dev_err(&pdev->dev, "failed to get core clk: %d\n", ret); + goto free_pltfm; + } + ret = clk_prepare_enable(pltfm_host->clk); + if (ret) + goto free_pltfm; + + if (!of_property_read_u32(np, "microchip,clock-delay", &value) && + (value > 0 && value <= MSHC_DLY_CC_MAX)) + sdhci_sparx5->delay_clock = value; + + sdhci_get_of_property(pdev); + + ret = mmc_of_parse(host->mmc); + if (ret) + goto err_clk; + + sdhci_sparx5->cpu_ctrl = syscon_regmap_lookup_by_compatible(syscon); + if (IS_ERR(sdhci_sparx5->cpu_ctrl)) { + dev_err(&pdev->dev, "No CPU syscon regmap !\n"); + ret = PTR_ERR(sdhci_sparx5->cpu_ctrl); + goto err_clk; + } + + if (sdhci_sparx5->delay_clock >= 0) + sparx5_set_delay(host, sdhci_sparx5->delay_clock); + + if (!mmc_card_is_removable(host->mmc)) { + /* Do a HW reset of eMMC card */ + sdhci_sparx5_reset_emmc(host); + /* Update EMMC_CTRL */ + sdhci_sparx5_set_emmc(host); + /* If eMMC, disable SD and SDIO */ + host->mmc->caps2 |= (MMC_CAP2_NO_SDIO|MMC_CAP2_NO_SD); + } + + ret = sdhci_add_host(host); + if (ret) + goto err_clk; + + /* Set AXI bus master to use un-cached access (for DMA) */ + if (host->flags & (SDHCI_USE_SDMA | SDHCI_USE_ADMA) && + IS_ENABLED(CONFIG_DMA_DECLARE_COHERENT)) + sparx5_set_cacheable(host, ACP_CACHE_FORCE_ENA); + + pr_debug("%s: SDHC version: 0x%08x\n", + mmc_hostname(host->mmc), sdhci_readl(host, MSHC2_VERSION)); + pr_debug("%s: SDHC type: 0x%08x\n", + mmc_hostname(host->mmc), sdhci_readl(host, MSHC2_TYPE)); + + return ret; + +err_clk: + clk_disable_unprepare(pltfm_host->clk); +free_pltfm: + sdhci_pltfm_free(pdev); + return ret; +} + +static const struct of_device_id sdhci_sparx5_of_match[] = { + { .compatible = "microchip,dw-sparx5-sdhci" }, + { } +}; +MODULE_DEVICE_TABLE(of, sdhci_sparx5_of_match); + +static struct platform_driver sdhci_sparx5_driver = { + .driver = { + .name = "sdhci-sparx5", + .of_match_table = sdhci_sparx5_of_match, + .pm = &sdhci_pltfm_pmops, + }, + .probe = sdhci_sparx5_probe, + .remove = sdhci_pltfm_unregister, +}; + +module_platform_driver(sdhci_sparx5_driver); + +MODULE_DESCRIPTION("Sparx5 SDHCI OF driver"); +MODULE_AUTHOR("Lars Povlsen "); +MODULE_LICENSE("GPL v2"); From e449d983811211592dad6f5ef8b25d5549be5e8e Mon Sep 17 00:00:00 2001 From: Tian Tao Date: Mon, 24 Aug 2020 09:02:36 +0800 Subject: [PATCH 026/271] mmc: sd: Use kobj_to_dev() instead of container_of() Use kobj_to_dev() instead of container_of() Signed-off-by: Tian Tao Link: https://lore.kernel.org/r/1598230956-58523-1-git-send-email-tiantao6@hisilicon.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/sd.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index 429ca14fa7e1..6f054c449d46 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -759,7 +759,7 @@ static struct attribute *sd_std_attrs[] = { static umode_t sd_std_is_visible(struct kobject *kobj, struct attribute *attr, int index) { - struct device *dev = container_of(kobj, struct device, kobj); + struct device *dev = kobj_to_dev(kobj); struct mmc_card *card = mmc_dev_to_card(dev); /* CIS vendor and device ids, revision and info string are available only for Combo cards */ From 9e70ff99ee871ec85e17da7c9d05cb5fcc306050 Mon Sep 17 00:00:00 2001 From: Haibo Chen Date: Mon, 24 Aug 2020 18:45:14 +0800 Subject: [PATCH 027/271] mmc: sdhci-esdhc-imx: remove unused code Value assigned to a variable(err) is never used, so remove it. Signed-off-by: Haibo Chen Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/1598265914-23606-1-git-send-email-haibo.chen@nxp.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-esdhc-imx.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index 0be334759c82..a2eb36c9b1da 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1653,10 +1653,8 @@ static int sdhci_esdhc_imx_probe(struct platform_device *pdev) goto disable_ipg_clk; imx_data->pinctrl = devm_pinctrl_get(&pdev->dev); - if (IS_ERR(imx_data->pinctrl)) { - err = PTR_ERR(imx_data->pinctrl); + if (IS_ERR(imx_data->pinctrl)) dev_warn(mmc_dev(host->mmc), "could not get pinctrl\n"); - } if (esdhc_is_usdhc(imx_data)) { host->quirks2 |= SDHCI_QUIRK2_PRESET_VALUE_BROKEN; From d425e42d600849ed81bca4e407a09f2555405ec7 Mon Sep 17 00:00:00 2001 From: Alex Dewar Date: Mon, 24 Aug 2020 18:18:50 +0100 Subject: [PATCH 028/271] mmc: sdhci-msm: enable compile-testing on !ARM There seems to be no particular reason to only test for ARM, so allow for build-testing on other platforms to increase coverage. Build-tested on x86 with allyesconfig. Signed-off-by: Alex Dewar Link: https://lore.kernel.org/r/20200824171854.406157-1-alex.dewar90@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 0ef1992ece3b..577b82dd13a0 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -537,7 +537,7 @@ config MMC_ATMELMCI config MMC_SDHCI_MSM tristate "Qualcomm SDHCI Controller Support" - depends on ARCH_QCOM || (ARM && COMPILE_TEST) + depends on ARCH_QCOM || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI From 7ca0f166f5b211f89fde9cf02357403e6f9715bc Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Tue, 25 Aug 2020 22:30:15 +0530 Subject: [PATCH 029/271] mmc: sdhci_am654: Add workaround for card detect debounce timer There is a one time delay because of a card detect debounce timer in the controller IP. This timer runs as soon as power is applied to the module regardless of whether a card is present or not and any writes to SDHCI_POWER_ON will return 0 before it expires. This timeout has been measured to be about 1 second in am654x and j721e. Write-and-read-back in a loop on SDHCI_POWER_ON for a maximum of 1.5 seconds to make sure that the controller actually powers on. Signed-off-by: Faiz Abbas Link: https://lore.kernel.org/r/20200825170015.32285-1-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 24 ++++++++++++++++++++++++ 1 file changed, 24 insertions(+) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 0f3ed716d3d2..c5f47197d177 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -6,6 +6,7 @@ * */ #include +#include #include #include #include @@ -272,9 +273,19 @@ static void sdhci_j721e_4bit_set_clock(struct sdhci_host *host, sdhci_set_clock(host, clock); } +static u8 sdhci_am654_write_power_on(struct sdhci_host *host, u8 val, int reg) +{ + writeb(val, host->ioaddr + reg); + usleep_range(1000, 10000); + return readb(host->ioaddr + reg); +} + +#define MAX_POWER_ON_TIMEOUT 1500000 /* us */ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) { unsigned char timing = host->mmc->ios.timing; + u8 pwr; + int ret; if (reg == SDHCI_HOST_CONTROL) { switch (timing) { @@ -291,6 +302,19 @@ static void sdhci_am654_write_b(struct sdhci_host *host, u8 val, int reg) } writeb(val, host->ioaddr + reg); + if (reg == SDHCI_POWER_CONTROL && (val & SDHCI_POWER_ON)) { + /* + * Power on will not happen until the card detect debounce + * timer expires. Wait at least 1.5 seconds for the power on + * bit to be set + */ + ret = read_poll_timeout(sdhci_am654_write_power_on, pwr, + pwr & SDHCI_POWER_ON, 0, + MAX_POWER_ON_TIMEOUT, false, host, val, + reg); + if (ret) + dev_warn(mmc_dev(host->mmc), "Power on failed\n"); + } } static int sdhci_am654_execute_tuning(struct mmc_host *mmc, u32 opcode) From 6e7d4de10890545b00f7d7c33310f1555963d592 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 20 Aug 2020 15:25:33 +0200 Subject: [PATCH 030/271] mmc: renesas_sdhi: move wrong 'hw_reset' to 'reset' This driver got the usage of 'hw_reset' wrong and missed that it is used to reset the remote HW (card) only, not the local one (controller). Move everything to the proper 'reset' callback. Also, add the generic reset code from TMIO, so we will ensure the same behaviour (it will get refactored away in a later patch). This also means we need to drop MMC_CAP_HW_RESET because this is currently not supported by our hardware. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200820132538.24758-2-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 19 +++++++++++++------ 1 file changed, 13 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 904f5237d8f7..eced87845784 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -631,11 +631,20 @@ static bool renesas_sdhi_check_scc_error(struct tmio_mmc_host *host) return renesas_sdhi_manual_correction(host, use_4tap); } -static void renesas_sdhi_hw_reset(struct tmio_mmc_host *host) +static void renesas_sdhi_reset(struct tmio_mmc_host *host) { - struct renesas_sdhi *priv; + struct renesas_sdhi *priv = host_to_priv(host); - priv = host_to_priv(host); + /* FIXME - should we set stop clock reg here */ + sd_ctrl_write16(host, CTL_RESET_SD, 0x0000); + usleep_range(10000, 11000); + sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); + usleep_range(10000, 11000); + + if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { + sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); + sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); + } renesas_sdhi_reset_scc(host, priv); renesas_sdhi_reset_hs400_mode(host, priv); @@ -862,11 +871,9 @@ int renesas_sdhi_probe(struct platform_device *pdev, renesas_sdhi_start_signal_voltage_switch; host->sdcard_irq_setbit_mask = TMIO_STAT_ALWAYS_SET_27; - /* SDR and HS200/400 registers requires HW reset */ if (of_data && of_data->scc_offset) { priv->scc_ctl = host->ctl + of_data->scc_offset; - host->mmc->caps |= MMC_CAP_HW_RESET; - host->hw_reset = renesas_sdhi_hw_reset; + host->reset = renesas_sdhi_reset; } } From 34e3211e5492756d4e59ef54f9ab1750ec7050ec Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 20 Aug 2020 15:25:34 +0200 Subject: [PATCH 031/271] Revert "mmc: tmio: fix reset operation" This reverts commit a87852c6b8827b7fece78ae57d871d56e4348e30. It did fix the issue, but was building on top of already wrong assumptions. The driver missed that 'hw_reset' was only for resetting remote HW (card) and not for the IP core. Since we fixed that in a previous patch, we can now remove this patch to make it clear that 'reset' is for resetting the IP core only. Also, cancelling DMA will only be called when actually needed again. It will also allow for further cleanups and better readability. Note that in addition to the revert, the call in 'tmio_mmc_execute_tuning' will be converted, too, to maintain the current behaviour. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200820132538.24758-3-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc_core.c | 29 +++++++++++++---------------- 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 946fb013c610..125530e7bd17 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -178,18 +178,6 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host) } } -static void tmio_mmc_hw_reset(struct mmc_host *mmc) -{ - struct tmio_mmc_host *host = mmc_priv(mmc); - - host->reset(host); - - tmio_mmc_abort_dma(host); - - if (host->hw_reset) - host->hw_reset(host); -} - static void tmio_mmc_reset_work(struct work_struct *work) { struct tmio_mmc_host *host = container_of(work, struct tmio_mmc_host, @@ -228,11 +216,12 @@ static void tmio_mmc_reset_work(struct work_struct *work) spin_unlock_irqrestore(&host->lock, flags); - tmio_mmc_hw_reset(host->mmc); + host->reset(host); /* Ready for new calls */ host->mrq = NULL; + tmio_mmc_abort_dma(host); mmc_request_done(host->mmc, mrq); } @@ -720,6 +709,14 @@ static int tmio_mmc_start_data(struct tmio_mmc_host *host, return 0; } +static void tmio_mmc_hw_reset(struct mmc_host *mmc) +{ + struct tmio_mmc_host *host = mmc_priv(mmc); + + if (host->hw_reset) + host->hw_reset(host); +} + static int tmio_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) { struct tmio_mmc_host *host = mmc_priv(mmc); @@ -732,7 +729,7 @@ static int tmio_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) if (ret < 0) { dev_warn(&host->pdev->dev, "Tuning procedure failed\n"); - tmio_mmc_hw_reset(mmc); + host->reset(host); } return ret; @@ -1180,7 +1177,7 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) _host->sdio_irq_mask = TMIO_SDIO_MASK_ALL; _host->set_clock(_host, 0); - tmio_mmc_hw_reset(mmc); + _host->reset(_host); _host->sdcard_irq_mask = sd_ctrl_read16_and_16_as_32(_host, CTL_IRQ_MASK); tmio_mmc_disable_mmc_irqs(_host, TMIO_MASK_ALL); @@ -1283,7 +1280,7 @@ int tmio_mmc_host_runtime_resume(struct device *dev) struct tmio_mmc_host *host = dev_get_drvdata(dev); tmio_mmc_clk_enable(host); - tmio_mmc_hw_reset(host->mmc); + host->reset(host); if (host->clk_cache) host->set_clock(host, host->clk_cache); From a8c83064e73382d6bb069bd51c56074d1e4f4eec Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 20 Aug 2020 15:25:35 +0200 Subject: [PATCH 032/271] mmc: tmio: remove indirection of 'hw_reset' callback After Yamada-san's refactorization introducing 'tmio_mmc_host_alloc', we can populate mmc_ops directly and don't need a layer inbetween. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200820132538.24758-4-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc.h | 1 - drivers/mmc/host/tmio_mmc_core.c | 9 --------- drivers/mmc/host/uniphier-sd.c | 5 +++-- 3 files changed, 3 insertions(+), 12 deletions(-) diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 0a4f36500add..dac717041c2d 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -178,7 +178,6 @@ struct tmio_mmc_host { unsigned int direction, int blk_size); int (*write16_hook)(struct tmio_mmc_host *host, int addr); void (*reset)(struct tmio_mmc_host *host); - void (*hw_reset)(struct tmio_mmc_host *host); bool (*check_retune)(struct tmio_mmc_host *host); /* diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 125530e7bd17..a4bd0a0a305c 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -709,14 +709,6 @@ static int tmio_mmc_start_data(struct tmio_mmc_host *host, return 0; } -static void tmio_mmc_hw_reset(struct mmc_host *mmc) -{ - struct tmio_mmc_host *host = mmc_priv(mmc); - - if (host->hw_reset) - host->hw_reset(host); -} - static int tmio_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) { struct tmio_mmc_host *host = mmc_priv(mmc); @@ -1008,7 +1000,6 @@ static struct mmc_host_ops tmio_mmc_ops = { .get_cd = tmio_mmc_get_cd, .enable_sdio_irq = tmio_mmc_enable_sdio_irq, .multi_io_quirk = tmio_multi_io_quirk, - .hw_reset = tmio_mmc_hw_reset, .execute_tuning = tmio_mmc_execute_tuning, }; diff --git a/drivers/mmc/host/uniphier-sd.c b/drivers/mmc/host/uniphier-sd.c index f82baf99fd69..55efd5c53249 100644 --- a/drivers/mmc/host/uniphier-sd.c +++ b/drivers/mmc/host/uniphier-sd.c @@ -409,8 +409,9 @@ static void uniphier_sd_clk_disable(struct tmio_mmc_host *host) clk_disable_unprepare(priv->clk); } -static void uniphier_sd_hw_reset(struct tmio_mmc_host *host) +static void uniphier_sd_hw_reset(struct mmc_host *mmc) { + struct tmio_mmc_host *host = mmc_priv(mmc); struct uniphier_sd_priv *priv = uniphier_sd_priv(host); reset_control_assert(priv->rst_hw); @@ -597,7 +598,7 @@ static int uniphier_sd_probe(struct platform_device *pdev) ret = PTR_ERR(priv->rst_hw); goto free_host; } - host->hw_reset = uniphier_sd_hw_reset; + host->ops.hw_reset = uniphier_sd_hw_reset; } if (host->mmc->caps & MMC_CAP_UHS) { From 576146ea4ed986f44a4406ec32a5d3363216b3d2 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 20 Aug 2020 15:25:36 +0200 Subject: [PATCH 033/271] mmc: tmio: factor out common parts of the reset routine Some TMIO variants need specific actions in their reset routine, but they are all based on a generic reset routine. So, the optional 'reset' callback will now only take care of the additional stuff and we will have a generic function around it. Less code, easier to maintain, and much more readable. Code in tmio_mmc.c is untested but in my TC6387XB datasheet the SDIO part is reset independently from the SD part, too. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200820132538.24758-5-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 11 ----------- drivers/mmc/host/tmio_mmc.c | 8 -------- drivers/mmc/host/tmio_mmc_core.c | 14 +++++++------- 3 files changed, 7 insertions(+), 26 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index eced87845784..9b2d4e8039d9 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -635,17 +635,6 @@ static void renesas_sdhi_reset(struct tmio_mmc_host *host) { struct renesas_sdhi *priv = host_to_priv(host); - /* FIXME - should we set stop clock reg here */ - sd_ctrl_write16(host, CTL_RESET_SD, 0x0000); - usleep_range(10000, 11000); - sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); - usleep_range(10000, 11000); - - if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { - sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); - sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); - } - renesas_sdhi_reset_scc(host, priv); renesas_sdhi_reset_hs400_mode(host, priv); diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 93e83ad25976..757c90160ae4 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -77,18 +77,10 @@ static void tmio_mmc_set_clock(struct tmio_mmc_host *host, static void tmio_mmc_reset(struct tmio_mmc_host *host) { - /* FIXME - should we set stop clock reg here */ - sd_ctrl_write16(host, CTL_RESET_SD, 0x0000); sd_ctrl_write16(host, CTL_RESET_SDIO, 0x0000); usleep_range(10000, 11000); - sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); sd_ctrl_write16(host, CTL_RESET_SDIO, 0x0001); usleep_range(10000, 11000); - - if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { - sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); - sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); - } } #ifdef CONFIG_PM_SLEEP diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index a4bd0a0a305c..f68c10b8ed61 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -172,6 +172,9 @@ static void tmio_mmc_reset(struct tmio_mmc_host *host) sd_ctrl_write16(host, CTL_RESET_SD, 0x0001); usleep_range(10000, 11000); + if (host->reset) + host->reset(host); + if (host->pdata->flags & TMIO_MMC_SDIO_IRQ) { sd_ctrl_write16(host, CTL_SDIO_IRQ_MASK, host->sdio_irq_mask); sd_ctrl_write16(host, CTL_TRANSACTION_CTL, 0x0001); @@ -216,7 +219,7 @@ static void tmio_mmc_reset_work(struct work_struct *work) spin_unlock_irqrestore(&host->lock, flags); - host->reset(host); + tmio_mmc_reset(host); /* Ready for new calls */ host->mrq = NULL; @@ -721,7 +724,7 @@ static int tmio_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) if (ret < 0) { dev_warn(&host->pdev->dev, "Tuning procedure failed\n"); - host->reset(host); + tmio_mmc_reset(host); } return ret; @@ -1144,9 +1147,6 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) mmc->caps & MMC_CAP_NEEDS_POLL || !mmc_card_is_removable(mmc)); - if (!_host->reset) - _host->reset = tmio_mmc_reset; - /* * On Gen2+, eMMC with NONREMOVABLE currently fails because native * hotplug gets disabled. It seems RuntimePM related yet we need further @@ -1168,7 +1168,7 @@ int tmio_mmc_host_probe(struct tmio_mmc_host *_host) _host->sdio_irq_mask = TMIO_SDIO_MASK_ALL; _host->set_clock(_host, 0); - _host->reset(_host); + tmio_mmc_reset(_host); _host->sdcard_irq_mask = sd_ctrl_read16_and_16_as_32(_host, CTL_IRQ_MASK); tmio_mmc_disable_mmc_irqs(_host, TMIO_MASK_ALL); @@ -1271,7 +1271,7 @@ int tmio_mmc_host_runtime_resume(struct device *dev) struct tmio_mmc_host *host = dev_get_drvdata(dev); tmio_mmc_clk_enable(host); - host->reset(host); + tmio_mmc_reset(host); if (host->clk_cache) host->set_clock(host, host->clk_cache); From 5b0739d76227fd5a3f02f014385bfa9c86e0404b Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 20 Aug 2020 15:25:37 +0200 Subject: [PATCH 034/271] mmc: tmio: don't reset whole IP core when tuning fails SDHI needs to reset the SCC only, not the whole IP core. So, if tuning fails, don't handle specifics in the generic TMIO core, but in the specific drivers. For SDHI, we need to move around the reset routine a bit. It is not modified. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200820132538.24758-6-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 45 +++++++++++++++------------- drivers/mmc/host/tmio_mmc_core.c | 4 +-- 2 files changed, 25 insertions(+), 24 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 9b2d4e8039d9..50a92d49efd9 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -432,6 +432,25 @@ static int renesas_sdhi_prepare_hs400_tuning(struct mmc_host *mmc, struct mmc_io return 0; } +static void renesas_sdhi_reset(struct tmio_mmc_host *host) +{ + struct renesas_sdhi *priv = host_to_priv(host); + + renesas_sdhi_reset_scc(host, priv); + renesas_sdhi_reset_hs400_mode(host, priv); + + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | + sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); + + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL, + ~SH_MOBILE_SDHI_SCC_RVSCNTL_RVSEN & + sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL)); + + if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) + sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, + TMIO_MASK_INIT_RCAR2); +} + #define SH_MOBILE_SDHI_MIN_TAP_ROW 3 static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) @@ -503,7 +522,7 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) static int renesas_sdhi_execute_tuning(struct tmio_mmc_host *host, u32 opcode) { struct renesas_sdhi *priv = host_to_priv(host); - int i; + int i, ret; priv->tap_num = renesas_sdhi_init_tuning(host); if (!priv->tap_num) @@ -531,7 +550,10 @@ static int renesas_sdhi_execute_tuning(struct tmio_mmc_host *host, u32 opcode) set_bit(i, priv->smpcmp); } - return renesas_sdhi_select_tuning(host); + ret = renesas_sdhi_select_tuning(host); + if (ret < 0) + renesas_sdhi_reset(host); + return ret; } static bool renesas_sdhi_manual_correction(struct tmio_mmc_host *host, bool use_4tap) @@ -631,25 +653,6 @@ static bool renesas_sdhi_check_scc_error(struct tmio_mmc_host *host) return renesas_sdhi_manual_correction(host, use_4tap); } -static void renesas_sdhi_reset(struct tmio_mmc_host *host) -{ - struct renesas_sdhi *priv = host_to_priv(host); - - renesas_sdhi_reset_scc(host, priv); - renesas_sdhi_reset_hs400_mode(host, priv); - - sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | - sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); - - sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL, - ~SH_MOBILE_SDHI_SCC_RVSCNTL_RVSEN & - sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL)); - - if (host->pdata->flags & TMIO_MMC_MIN_RCAR2) - sd_ctrl_write32_as_16_and_16(host, CTL_IRQ_MASK, - TMIO_MASK_INIT_RCAR2); -} - static int renesas_sdhi_wait_idle(struct tmio_mmc_host *host, u32 bit) { int timeout = 1000; diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index f68c10b8ed61..e7bad761c714 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -722,10 +722,8 @@ static int tmio_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) ret = host->execute_tuning(host, opcode); - if (ret < 0) { + if (ret < 0) dev_warn(&host->pdev->dev, "Tuning procedure failed\n"); - tmio_mmc_reset(host); - } return ret; } From 510bfe58936f7d79d4c377bcc7425244d879b1ff Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 20 Aug 2020 15:25:38 +0200 Subject: [PATCH 035/271] mmc: tmio: remove indirection of 'execute_tuning' callback After all the previous refactorization, we can now populate mmc_ops directly and don't need a layer inbetween. The NULL-pointer check and the error printout are already done by the MMC core. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200820132538.24758-7-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 7 ++++--- drivers/mmc/host/tmio_mmc.h | 6 ------ drivers/mmc/host/tmio_mmc_core.c | 17 ----------------- 3 files changed, 4 insertions(+), 26 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 50a92d49efd9..1243c42d3d1b 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -519,8 +519,9 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) return 0; } -static int renesas_sdhi_execute_tuning(struct tmio_mmc_host *host, u32 opcode) +static int renesas_sdhi_execute_tuning(struct mmc_host *mmc, u32 opcode) { + struct tmio_mmc_host *host = mmc_priv(mmc); struct renesas_sdhi *priv = host_to_priv(host); int i, ret; @@ -543,7 +544,7 @@ static int renesas_sdhi_execute_tuning(struct tmio_mmc_host *host, u32 opcode) /* Set sampling clock position */ sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TAPSET, i % priv->tap_num); - if (mmc_send_tuning(host->mmc, opcode, NULL) == 0) + if (mmc_send_tuning(mmc, opcode, NULL) == 0) set_bit(i, priv->taps); if (sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_SMPCMP) == 0) @@ -942,8 +943,8 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (!hit) dev_warn(&host->pdev->dev, "Unknown clock rate for tuning\n"); - host->execute_tuning = renesas_sdhi_execute_tuning; host->check_retune = renesas_sdhi_check_scc_error; + host->ops.execute_tuning = renesas_sdhi_execute_tuning; host->ops.prepare_hs400_tuning = renesas_sdhi_prepare_hs400_tuning; host->ops.hs400_downgrade = renesas_sdhi_disable_scc; host->ops.hs400_complete = renesas_sdhi_hs400_complete; diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index dac717041c2d..51b5f388f6d8 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -180,12 +180,6 @@ struct tmio_mmc_host { void (*reset)(struct tmio_mmc_host *host); bool (*check_retune)(struct tmio_mmc_host *host); - /* - * Mandatory callback for tuning to occur which is optional for SDR50 - * and mandatory for SDR104. - */ - int (*execute_tuning)(struct tmio_mmc_host *host, u32 opcode); - void (*prepare_hs400_tuning)(struct tmio_mmc_host *host); void (*hs400_downgrade)(struct tmio_mmc_host *host); void (*hs400_complete)(struct tmio_mmc_host *host); diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index e7bad761c714..0f266cbf82b8 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -712,22 +712,6 @@ static int tmio_mmc_start_data(struct tmio_mmc_host *host, return 0; } -static int tmio_mmc_execute_tuning(struct mmc_host *mmc, u32 opcode) -{ - struct tmio_mmc_host *host = mmc_priv(mmc); - int ret; - - if (!host->execute_tuning) - return 0; - - ret = host->execute_tuning(host, opcode); - - if (ret < 0) - dev_warn(&host->pdev->dev, "Tuning procedure failed\n"); - - return ret; -} - static void tmio_process_mrq(struct tmio_mmc_host *host, struct mmc_request *mrq) { @@ -1001,7 +985,6 @@ static struct mmc_host_ops tmio_mmc_ops = { .get_cd = tmio_mmc_get_cd, .enable_sdio_irq = tmio_mmc_enable_sdio_irq, .multi_io_quirk = tmio_multi_io_quirk, - .execute_tuning = tmio_mmc_execute_tuning, }; static int tmio_mmc_init_ocr(struct tmio_mmc_host *host) From 4dac686bb3571b65b58316991eb833431a3d19f1 Mon Sep 17 00:00:00 2001 From: Veerabhadrarao Badiganti Date: Thu, 27 Aug 2020 20:51:28 +0530 Subject: [PATCH 036/271] mmc: sdhci-msm: Enable restore_dll_config flag for sc7180 target On sc7180 target, issues are observed with HS400 mode due to a hardware limitation. If sdcc clock is dynamically gated and ungated, the very next command is failing with command CRC/timeout errors. To mitigate this issue, DLL phase has to be restored whenever sdcc clock is gated dynamically. The restore_dll_config ensures this. Enabling this flag with this change. And simply re-using the sdm845 target configuration for this flag. Signed-off-by: Veerabhadrarao Badiganti Reviewed-by: Douglas Anderson Tested-by: Douglas Anderson Link: https://lore.kernel.org/r/1598541694-15694-1-git-send-email-vbadigan@codeaurora.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 5a33389037cd..d4c02884cca8 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -2151,6 +2151,7 @@ static const struct of_device_id sdhci_msm_dt_match[] = { {.compatible = "qcom,sdhci-msm-v5", .data = &sdhci_msm_v5_var}, {.compatible = "qcom,sdm845-sdhci", .data = &sdm845_sdhci_var}, {.compatible = "qcom,sm8250-sdhci", .data = &sm8250_sdhci_var}, + {.compatible = "qcom,sc7180-sdhci", .data = &sdm845_sdhci_var}, {}, }; From 887ba410ede62da12e77684072f6c7491c0fa909 Mon Sep 17 00:00:00 2001 From: Chun-Hung Wu Date: Thu, 27 Aug 2020 17:33:02 +0800 Subject: [PATCH 037/271] mmc: cqhci: add new cqhci_host_ops pre_enable() and post_disable() Add pre_enable() and post_disable() for cqhci_host_ops. Add hook functions before cqhci enable and after cqhci disable for platforms need them. Signed-off-by: Chun-Hung Wu Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/1598520783-25250-2-git-send-email-chun-hung.wu@mediatek.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/cqhci.c | 6 ++++++ drivers/mmc/host/cqhci.h | 2 ++ 2 files changed, 8 insertions(+) diff --git a/drivers/mmc/host/cqhci.c b/drivers/mmc/host/cqhci.c index cfa87dfa73d8..697fe40756bf 100644 --- a/drivers/mmc/host/cqhci.c +++ b/drivers/mmc/host/cqhci.c @@ -376,6 +376,9 @@ static void cqhci_off(struct mmc_host *mmc) else pr_debug("%s: cqhci: CQE off\n", mmc_hostname(mmc)); + if (cq_host->ops->post_disable) + cq_host->ops->post_disable(mmc); + mmc->cqe_on = false; } @@ -580,6 +583,9 @@ static int cqhci_request(struct mmc_host *mmc, struct mmc_request *mrq) __cqhci_enable(cq_host); if (!mmc->cqe_on) { + if (cq_host->ops->pre_enable) + cq_host->ops->pre_enable(mmc); + cqhci_writel(cq_host, 0, CQHCI_CTL); mmc->cqe_on = true; pr_debug("%s: cqhci: CQE on\n", mmc_hostname(mmc)); diff --git a/drivers/mmc/host/cqhci.h b/drivers/mmc/host/cqhci.h index 437700179de4..89bf6adbce8c 100644 --- a/drivers/mmc/host/cqhci.h +++ b/drivers/mmc/host/cqhci.h @@ -206,6 +206,8 @@ struct cqhci_host_ops { void (*disable)(struct mmc_host *mmc, bool recovery); void (*update_dcmd_desc)(struct mmc_host *mmc, struct mmc_request *mrq, u64 *data); + void (*pre_enable)(struct mmc_host *mmc); + void (*post_disable)(struct mmc_host *mmc); }; static inline void cqhci_writel(struct cqhci_host *host, u32 val, int reg) From e282f2045a3df69e8303488684bc9f3a23d2677d Mon Sep 17 00:00:00 2001 From: Chun-Hung Wu Date: Thu, 27 Aug 2020 17:33:03 +0800 Subject: [PATCH 038/271] mmc: mediatek: add pre_enable() and post_disable() hook function CQHCI_ENABLE bit in CQHCI_CFG should be disabled after msdc_cqe_disable(), and should be enabled before msdc_ceq_enable() for MTK platform. Add hook functions for cqhci_host_ops->pre_enable() and cqhci_host_ops->post_disable(). Signed-off-by: Chun-Hung Wu Acked-by: Adrian Hunter Link: https://lore.kernel.org/r/1598520783-25250-3-git-send-email-chun-hung.wu@mediatek.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index b0c27944db7f..ed2b24691b4f 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2290,6 +2290,26 @@ static void msdc_cqe_disable(struct mmc_host *mmc, bool recovery) } } +static void msdc_cqe_pre_enable(struct mmc_host *mmc) +{ + struct cqhci_host *cq_host = mmc->cqe_private; + u32 reg; + + reg = cqhci_readl(cq_host, CQHCI_CFG); + reg |= CQHCI_ENABLE; + cqhci_writel(cq_host, reg, CQHCI_CFG); +} + +static void msdc_cqe_post_disable(struct mmc_host *mmc) +{ + struct cqhci_host *cq_host = mmc->cqe_private; + u32 reg; + + reg = cqhci_readl(cq_host, CQHCI_CFG); + reg &= ~CQHCI_ENABLE; + cqhci_writel(cq_host, reg, CQHCI_CFG); +} + static const struct mmc_host_ops mt_msdc_ops = { .post_req = msdc_post_req, .pre_req = msdc_pre_req, @@ -2309,6 +2329,8 @@ static const struct mmc_host_ops mt_msdc_ops = { static const struct cqhci_host_ops msdc_cmdq_ops = { .enable = msdc_cqe_enable, .disable = msdc_cqe_disable, + .pre_enable = msdc_cqe_pre_enable, + .post_disable = msdc_cqe_post_disable, }; static void msdc_of_property_parse(struct platform_device *pdev, From 8d62fa831fd2a3a58726f731726f30fc425d7f55 Mon Sep 17 00:00:00 2001 From: Stefan Wahren Date: Fri, 28 Aug 2020 23:47:14 +0200 Subject: [PATCH 039/271] mmc: sdhci-iproc: Enable eMMC DDR 3.3V support for bcm2711 The emmc2 interface on the bcm2711 supports DDR modes for eMMC devices running at 3.3V. This allows to run eMMC module with 3.3V signaling voltage at DDR52 mode on the Raspberry Pi 4 using a SD adapter: clock: 52000000 Hz actual clock: 50000000 Hz vdd: 21 (3.3 ~ 3.4 V) bus mode: 2 (push-pull) chip select: 0 (don't care) power mode: 2 (on) bus width: 2 (4 bits) timing spec: 8 (mmc DDR52) signal voltage: 0 (3.30 V) driver type: 0 (driver type B) Link: https://github.com/raspberrypi/linux/issues/3802 Signed-off-by: Stefan Wahren Link: https://lore.kernel.org/r/1598651234-29826-1-git-send-email-stefan.wahren@i2se.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-iproc.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c index e2d8dfe90077..b540aa6faacb 100644 --- a/drivers/mmc/host/sdhci-iproc.c +++ b/drivers/mmc/host/sdhci-iproc.c @@ -283,6 +283,7 @@ static const struct sdhci_pltfm_data sdhci_bcm2711_pltfm_data = { static const struct sdhci_iproc_data bcm2711_data = { .pdata = &sdhci_bcm2711_pltfm_data, + .mmc_caps = MMC_CAP_3_3V_DDR, }; static const struct of_device_id sdhci_iproc_of_match[] = { From 80fd350b9590d567196de215391141880cc8ea5a Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Sat, 29 Aug 2020 08:25:05 +0200 Subject: [PATCH 040/271] dt-bindings: mmc: fsl-imx-esdhc: Fix i.MX 8 compatible matching The i.MX 8 DTSes use two compatibles so update the binding to fix dtbs_check warnings like: arch/arm64/boot/dts/freescale/imx8mn-evk.dt.yaml: mmc@30b40000: compatible: ['fsl,imx8mn-usdhc', 'fsl,imx7d-usdhc'] is too long From schema: Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml arch/arm64/boot/dts/freescale/imx8mn-evk.dt.yaml: mmc@30b40000: compatible: Additional items are not allowed ('fsl,imx7d-usdhc' was unexpected) arch/arm64/boot/dts/freescale/imx8mn-ddr4-evk.dt.yaml: mmc@30b40000: compatible: ['fsl,imx8mn-usdhc', 'fsl,imx7d-usdhc'] is too long Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200829062505.4642-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- .../bindings/clock/imx8qxp-lpcg.yaml | 2 +- .../bindings/mmc/fsl-imx-esdhc.yaml | 37 ++++++++++--------- 2 files changed, 21 insertions(+), 18 deletions(-) diff --git a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml index 1d5e9bcce4c8..33f3010f48c3 100644 --- a/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml +++ b/Documentation/devicetree/bindings/clock/imx8qxp-lpcg.yaml @@ -62,7 +62,7 @@ examples: }; mmc@5b010000 { - compatible = "fsl,imx8qxp-usdhc"; + compatible = "fsl,imx8qxp-usdhc", "fsl,imx7d-usdhc"; interrupts = ; reg = <0x5b010000 0x10000>; clocks = <&conn_lpcg IMX_CONN_LPCG_SDHC0_IPG_CLK>, diff --git a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml index 10b45966f1b8..e71d13c2d109 100644 --- a/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml +++ b/Documentation/devicetree/bindings/mmc/fsl-imx-esdhc.yaml @@ -21,23 +21,26 @@ description: | properties: compatible: - enum: - - fsl,imx25-esdhc - - fsl,imx35-esdhc - - fsl,imx51-esdhc - - fsl,imx53-esdhc - - fsl,imx6q-usdhc - - fsl,imx6sl-usdhc - - fsl,imx6sx-usdhc - - fsl,imx6ull-usdhc - - fsl,imx7d-usdhc - - fsl,imx7ulp-usdhc - - fsl,imx8mq-usdhc - - fsl,imx8mm-usdhc - - fsl,imx8mn-usdhc - - fsl,imx8mp-usdhc - - fsl,imx8qm-usdhc - - fsl,imx8qxp-usdhc + oneOf: + - enum: + - fsl,imx25-esdhc + - fsl,imx35-esdhc + - fsl,imx51-esdhc + - fsl,imx53-esdhc + - fsl,imx6q-usdhc + - fsl,imx6sl-usdhc + - fsl,imx6sx-usdhc + - fsl,imx6ull-usdhc + - fsl,imx7d-usdhc + - fsl,imx7ulp-usdhc + - items: + - enum: + - fsl,imx8mm-usdhc + - fsl,imx8mn-usdhc + - fsl,imx8mp-usdhc + - fsl,imx8mq-usdhc + - fsl,imx8qxp-usdhc + - const: fsl,imx7d-usdhc reg: maxItems: 1 From 9381118fcfd7794dee7e2a8fd1bbbbc3ecb9c240 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 31 Aug 2020 18:11:47 +0200 Subject: [PATCH 041/271] dt-bindings: mmc: mmc-pwreq-simple: Accept more than one reset GPIO There might be multiple reset GPIOs but dtschema has trouble parsing it if there are no maxItems: arch/arm/boot/dts/exynos5250-snow.dt.yaml: mmc3_pwrseq: reset-gpios: [[20, 2, 1], [20, 1, 1]] is too long From schema: Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200831161147.13515-2-krzk@kernel.org Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml | 2 ++ 1 file changed, 2 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml index 449215444723..8d625f903856 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml +++ b/Documentation/devicetree/bindings/mmc/mmc-pwrseq-simple.yaml @@ -20,6 +20,8 @@ properties: reset-gpios: minItems: 1 + # Put some limit to avoid false warnings + maxItems: 32 description: contains a list of GPIO specifiers. The reset GPIOs are asserted at initialization and prior we start the power up procedure of the card. From e0c29be62cdc32605281024a662b2f8f690a3f77 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 31 Aug 2020 12:23:24 +0200 Subject: [PATCH 042/271] mmc: meson-gx: use wrapper to avoid accessing internal vars Signed-off-by: Wolfram Sang Reviewed-by: Kevin Hilman Link: https://lore.kernel.org/r/20200831102324.12566-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index 08a3b1c05acb..a1db8685e30e 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -521,7 +521,7 @@ static int meson_mmc_resampling_tuning(struct mmc_host *mmc, u32 opcode) val |= ADJUST_ADJ_EN; writel(val, host->regs + host->data->adjust); - if (mmc->doing_retune) + if (mmc_doing_retune(mmc)) dly = FIELD_GET(ADJUST_ADJ_DELAY_MASK, val) + 1; else dly = 0; From 1796164fac7e348f74a0f1f1cae995b22d002315 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Tue, 1 Sep 2020 10:50:03 +0200 Subject: [PATCH 043/271] dt-bindings: mmc: document alias support As for I2C and SPI, it now is possible to reserve a fixed index for mmc/mmcblk devices. Signed-off-by: Matthias Schiffer Link: https://lore.kernel.org/r/20200901085004.2512-1-matthias.schiffer@ew.tq-group.com Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/mmc-controller.yaml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml index b96da0c7f819..f928f66fc59a 100644 --- a/Documentation/devicetree/bindings/mmc/mmc-controller.yaml +++ b/Documentation/devicetree/bindings/mmc/mmc-controller.yaml @@ -14,6 +14,10 @@ description: | that requires the respective functionality should implement them using these definitions. + It is possible to assign a fixed index mmcN to an MMC host controller + (and the corresponding mmcblkN devices) by defining an alias in the + /aliases device tree node. + properties: $nodename: pattern: "^mmc(@.*)?$" From fa2d0aa96941e8dc347e49a04c75468e3002ee20 Mon Sep 17 00:00:00 2001 From: Matthias Schiffer Date: Tue, 1 Sep 2020 10:50:04 +0200 Subject: [PATCH 044/271] mmc: core: Allow setting slot index via device tree alias As with GPIO, UART and others, allow specifying the device index via the aliases node in the device tree. On embedded devices, there is often a combination of removable (e.g. SD card) and non-removable MMC devices (e.g. eMMC). Therefore the index might change depending on * host of removable device * removable card present or not This makes it difficult to hardcode the root device, if it is on the non-removable device. E.g. if SD card is present eMMC will be mmcblk1, if SD card is not present at boot, eMMC will be mmcblk0. Alternative solutions like PARTUUIDs do not cover the case where multiple mmcblk devices contain the same image. This is a common issue on devices that can boot both from eMMC (for regular boot) and SD cards (as a temporary boot medium for development). When a firmware image is installed to eMMC after a test boot via SD card, there will be no reliable way to refer to a specific device using (PART)UUIDs oder LABELs. The demand for this feature has led to multiple attempts to implement it, dating back at least to 2012 (see https://www.spinics.net/lists/linux-mmc/msg26586.html for a previous discussion from 2014). All indices defined in the aliases node will be reserved for use by the respective MMC device, moving the indices of devices that don't have an alias up into the non-reserved range. If the aliases node is not found, the driver will act as before. This is a rebased and cleaned up version of https://www.spinics.net/lists/linux-mmc/msg26588.html . Based-on-patch-by: Sascha Hauer Link: https://lkml.org/lkml/2020/8/5/194 Signed-off-by: Matthias Schiffer Link: https://lore.kernel.org/r/20200901085004.2512-2-matthias.schiffer@ew.tq-group.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/host.c | 26 +++++++++++++++++++++++++- 1 file changed, 25 insertions(+), 1 deletion(-) diff --git a/drivers/mmc/core/host.c b/drivers/mmc/core/host.c index c8fae6611b73..96b2ca1f1b06 100644 --- a/drivers/mmc/core/host.c +++ b/drivers/mmc/core/host.c @@ -376,6 +376,20 @@ int mmc_of_parse_voltage(struct device_node *np, u32 *mask) } EXPORT_SYMBOL(mmc_of_parse_voltage); +/** + * mmc_first_nonreserved_index() - get the first index that is not reserved + */ +static int mmc_first_nonreserved_index(void) +{ + int max; + + max = of_alias_get_highest_id("mmc"); + if (max < 0) + return 0; + + return max + 1; +} + /** * mmc_alloc_host - initialise the per-host structure. * @extra: sizeof private data structure @@ -387,6 +401,7 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) { int err; struct mmc_host *host; + int alias_id, min_idx, max_idx; host = kzalloc(sizeof(struct mmc_host) + extra, GFP_KERNEL); if (!host) @@ -395,7 +410,16 @@ struct mmc_host *mmc_alloc_host(int extra, struct device *dev) /* scanning will be enabled when we're ready */ host->rescan_disable = 1; - err = ida_simple_get(&mmc_host_ida, 0, 0, GFP_KERNEL); + alias_id = of_alias_get_id(dev->of_node, "mmc"); + if (alias_id >= 0) { + min_idx = alias_id; + max_idx = alias_id + 1; + } else { + min_idx = mmc_first_nonreserved_index(); + max_idx = 0; + } + + err = ida_simple_get(&mmc_host_ida, min_idx, max_idx, GFP_KERNEL); if (err < 0) { kfree(host); return NULL; From aec429e09e2d51e34ecdea5446d6061ec2e4bf74 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:48 +0200 Subject: [PATCH 045/271] mmc: bcm2835: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code and the error value gets printed. Signed-off-by: Krzysztof Kozlowski Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20200902193658.20539-2-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/bcm2835.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index a0767790a826..35320bc9dc02 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1406,9 +1406,7 @@ static int bcm2835_probe(struct platform_device *pdev) clk = devm_clk_get(dev, NULL); if (IS_ERR(clk)) { - ret = PTR_ERR(clk); - if (ret != -EPROBE_DEFER) - dev_err(dev, "could not get clk: %d\n", ret); + ret = dev_err_probe(dev, PTR_ERR(clk), "could not get clk\n"); goto err; } From 3a35e7e1bd50f794ede9ed1291402bacf2c79480 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:49 +0200 Subject: [PATCH 046/271] mmc: davinci: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code and the error value gets printed. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-3-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/davinci_mmc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index e50a08bce7ef..fad1010fb52b 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -1240,9 +1240,8 @@ static int davinci_mmcsd_probe(struct platform_device *pdev) pdev->id_entry = match->data; ret = mmc_of_parse(mmc); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, - "could not parse of data: %d\n", ret); + dev_err_probe(&pdev->dev, ret, + "could not parse of data\n"); goto parse_fail; } } else { From 380a99a2d084d78d6dd47f63f3a178e24321c681 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:50 +0200 Subject: [PATCH 047/271] mmc: dw_mmc-zx: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code and the error value gets printed. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-4-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-zx.c | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/drivers/mmc/host/dw_mmc-zx.c b/drivers/mmc/host/dw_mmc-zx.c index eada648b27ec..d9e483432a61 100644 --- a/drivers/mmc/host/dw_mmc-zx.c +++ b/drivers/mmc/host/dw_mmc-zx.c @@ -155,7 +155,6 @@ static int dw_mci_zx_parse_dt(struct dw_mci *host) struct device_node *node; struct dw_mci_zx_priv_data *priv; struct regmap *sysc_base; - int ret; /* syscon is needed only by emmc */ node = of_parse_phandle(np, "zte,aon-syscon", 0); @@ -163,13 +162,9 @@ static int dw_mci_zx_parse_dt(struct dw_mci *host) sysc_base = syscon_node_to_regmap(node); of_node_put(node); - if (IS_ERR(sysc_base)) { - ret = PTR_ERR(sysc_base); - if (ret != -EPROBE_DEFER) - dev_err(host->dev, "Can't get syscon: %d\n", - ret); - return ret; - } + if (IS_ERR(sysc_base)) + return dev_err_probe(host->dev, PTR_ERR(sysc_base), + "Can't get syscon\n"); } else { return 0; } From 295208a6666de8c438054c625d97138a05087655 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:51 +0200 Subject: [PATCH 048/271] mmc: jz4740: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code and the error value gets printed. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-5-krzk@kernel.org Reviewed-by: Paul Cercueil Signed-off-by: Ulf Hansson --- drivers/mmc/host/jz4740_mmc.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index 81d71010b474..fea0a42010c7 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -991,9 +991,7 @@ static int jz4740_mmc_probe(struct platform_device* pdev) ret = mmc_of_parse(mmc); if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, - "could not parse device properties: %d\n", ret); + dev_err_probe(&pdev->dev, ret, "could not parse device properties\n"); goto err_free_host; } From 89280d0963fdb615f8fdfc96592f9aab27592c14 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:52 +0200 Subject: [PATCH 049/271] mmc: meson: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code and the error value gets printed. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-6-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-gx-mmc.c | 16 +++++----------- 1 file changed, 5 insertions(+), 11 deletions(-) diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index a1db8685e30e..a68b43082f61 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -426,11 +426,9 @@ static int meson_mmc_clk_init(struct meson_host *host) snprintf(name, sizeof(name), "clkin%d", i); clk = devm_clk_get(host->dev, name); - if (IS_ERR(clk)) { - if (clk != ERR_PTR(-EPROBE_DEFER)) - dev_err(host->dev, "Missing clock %s\n", name); - return PTR_ERR(clk); - } + if (IS_ERR(clk)) + return dev_err_probe(host->dev, PTR_ERR(clk), + "Missing clock %s\n", name); mux_parent_names[i] = __clk_get_name(clk); } @@ -1077,12 +1075,8 @@ static int meson_mmc_probe(struct platform_device *pdev) } ret = device_reset_optional(&pdev->dev); - if (ret) { - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "device reset failed: %d\n", ret); - - return ret; - } + if (ret) + return dev_err_probe(&pdev->dev, ret, "device reset failed\n"); res = platform_get_resource(pdev, IORESOURCE_MEM, 0); host->regs = devm_ioremap_resource(&pdev->dev, res); From 72ea817db5d26e2dab3695a576b5557ada9cd722 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:54 +0200 Subject: [PATCH 050/271] mmc: sdhci-of-arasan: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code and the error value gets printed. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-8-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-arasan.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index f186fbd016b1..3aea46db2ea7 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -1543,10 +1543,9 @@ static int sdhci_arasan_probe(struct platform_device *pdev) of_node_put(node); if (IS_ERR(sdhci_arasan->soc_ctl_base)) { - ret = PTR_ERR(sdhci_arasan->soc_ctl_base); - if (ret != -EPROBE_DEFER) - dev_err(&pdev->dev, "Can't get syscon: %d\n", - ret); + ret = dev_err_probe(&pdev->dev, + PTR_ERR(sdhci_arasan->soc_ctl_base), + "Can't get syscon\n"); goto err_pltfm_free; } } From 180a46652e09c7882330ab9ace060f0e13ca13f0 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:55 +0200 Subject: [PATCH 051/271] mmc: sdhci-tegra: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code and the error value gets printed. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-9-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-tegra.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 13fbf70b5fde..428cdc53c858 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1660,11 +1660,8 @@ static int sdhci_tegra_probe(struct platform_device *pdev) clk = devm_clk_get(mmc_dev(host->mmc), NULL); if (IS_ERR(clk)) { - rc = PTR_ERR(clk); - - if (rc != -EPROBE_DEFER) - dev_err(&pdev->dev, "failed to get clock: %d\n", rc); - + rc = dev_err_probe(&pdev->dev, PTR_ERR(clk), + "failed to get clock\n"); goto err_clk_get; } clk_prepare_enable(clk); From 308d2722f6430839f06572af776f37f7af697134 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:56 +0200 Subject: [PATCH 052/271] mmc: dw_mmc: Simplify with dev_err_probe() Common pattern of handling deferred probe can be simplified with dev_err_probe(). Less code, the error value gets printed and real error from dw_mci_parse_dt() is passed further instead of fixed -EINVAL. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-10-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/dw_mmc.c b/drivers/mmc/host/dw_mmc.c index 0fba940544ca..43c5795691fb 100644 --- a/drivers/mmc/host/dw_mmc.c +++ b/drivers/mmc/host/dw_mmc.c @@ -3161,12 +3161,9 @@ int dw_mci_probe(struct dw_mci *host) if (!host->pdata) { host->pdata = dw_mci_parse_dt(host); - if (PTR_ERR(host->pdata) == -EPROBE_DEFER) { - return -EPROBE_DEFER; - } else if (IS_ERR(host->pdata)) { - dev_err(host->dev, "platform data not available\n"); - return -EINVAL; - } + if (IS_ERR(host->pdata)) + return dev_err_probe(host->dev, PTR_ERR(host->pdata), + "platform data not available\n"); } host->biu_clk = devm_clk_get(host->dev, "biu"); From b5f9a2c686d91ad5a97ee4a5be9ae1e6da8d4789 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:57 +0200 Subject: [PATCH 053/271] mmc: sdhci-of-sparx5: Use proper printk format for dma_addr_t MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit dma_addr_t size varies between architectures so use dedicated printk format to fix compile testing warning (e.g. on 32-bit MIPS): drivers/mmc/host/sdhci-of-sparx5.c:63:11: warning: format ‘%llx’ expects argument of type ‘long long unsigned int’, but argument 5 has type ‘dma_addr_t {aka unsigned int}’ [-Wformat=] Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902193658.20539-11-krzk@kernel.org Acked-by: Lars Povlsen Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-sparx5.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-sparx5.c b/drivers/mmc/host/sdhci-of-sparx5.c index 14b5dad6575a..747f108a0ace 100644 --- a/drivers/mmc/host/sdhci-of-sparx5.c +++ b/drivers/mmc/host/sdhci-of-sparx5.c @@ -60,8 +60,8 @@ static void sdhci_sparx5_adma_write_desc(struct sdhci_host *host, void **desc, return; } - pr_debug("%s: write_desc: splitting dma len %d, offset 0x%0llx\n", - mmc_hostname(host->mmc), len, addr); + pr_debug("%s: write_desc: splitting dma len %d, offset %pad\n", + mmc_hostname(host->mmc), len, &addr); offset = addr & (SZ_128M - 1); tmplen = SZ_128M - offset; From b41123f4f57df6950f3316948989783bbdefa7e2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 21:36:53 +0200 Subject: [PATCH 054/271] mmc: sdhci-brcmstb: Simplify with optional clock and dev_err_probe() Only -ENOENT from devm_clk_get() means that clock is not present in device tree. Other errors have their own meaning and should not be ignored. Simplify getting the clock which is in fact optional and also use dev_err_probe() for handling deferred. Signed-off-by: Krzysztof Kozlowski Acked-by: Florian Fainelli Link: https://lore.kernel.org/r/20200902193658.20539-7-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-brcmstb.c | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index ad01f6451a95..876668827580 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -235,13 +235,11 @@ static int sdhci_brcmstb_probe(struct platform_device *pdev) dev_dbg(&pdev->dev, "Probe found match for %s\n", match->compatible); - clk = devm_clk_get(&pdev->dev, NULL); - if (IS_ERR(clk)) { - if (PTR_ERR(clk) == -EPROBE_DEFER) - return -EPROBE_DEFER; - dev_err(&pdev->dev, "Clock not found in Device Tree\n"); - clk = NULL; - } + clk = devm_clk_get_optional(&pdev->dev, NULL); + if (IS_ERR(clk)) + return dev_err_probe(&pdev->dev, PTR_ERR(clk), + "Failed to get clock from Device Tree\n"); + res = clk_prepare_enable(clk); if (res) return res; From 8c7f51effd7387346e2754b545e224a68b9580ec Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 22:48:45 +0200 Subject: [PATCH 055/271] mmc: davinci: Fix -Wpointer-to-int-cast on compile test Store in interrupt service routine always '1' in end_command, not the value of host->cmd to fix compile test warnings on RISC-V: drivers/mmc/host/davinci_mmc.c:999:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902204847.2764-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/davinci_mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index fad1010fb52b..66d740ee7d45 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -996,7 +996,7 @@ static irqreturn_t mmc_davinci_irq(int irq, void *dev_id) if (qstatus & MMCST0_RSPDNE) { /* End of command phase */ - end_command = (int) host->cmd; + end_command = host->cmd ? 1 : 0; } if (end_command) From da4599992d3eff5e52a2b32f6e4099f19b1dd655 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 22:48:46 +0200 Subject: [PATCH 056/271] mmc: s3cmci: Use proper printk format for iomem pointer iomem pointers should be printed with pointer format to hide the actual value and fix warnings when compile testing for 64-bit architecture: drivers/mmc/host/s3cmci.c:1355:46: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902204847.2764-2-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/s3cmci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index b5df948f8155..477dcf417d4d 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -1376,7 +1376,7 @@ static int s3cmci_state_show(struct seq_file *seq, void *v) { struct s3cmci_host *host = seq->private; - seq_printf(seq, "Register base = 0x%08x\n", (u32)host->base); + seq_printf(seq, "Register base = 0x%p\n", host->base); seq_printf(seq, "Clock rate = %ld\n", host->clk_rate); seq_printf(seq, "Prescale = %d\n", host->prescaler); seq_printf(seq, "is2440 = %d\n", host->is2440); From 6e7fb09e1924927149b68e309d6524de777e54c2 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 2 Sep 2020 22:48:47 +0200 Subject: [PATCH 057/271] mmc: s3cmci: Cast driver data through long Since driver data is a pointer, direct casting to integer causes warning when compile testing for 64-bit architecture: drivers/mmc/host/s3cmci.c:1495:17: warning: cast from pointer to integer of different size [-Wpointer-to-int-cast] The actual driver data can be only 0 or 1, so cast it via long and do not care about any loss of value. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200902204847.2764-3-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/s3cmci.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index 477dcf417d4d..704103a671e7 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -1519,7 +1519,7 @@ static int s3cmci_probe_dt(struct s3cmci_host *host) struct mmc_host *mmc = host->mmc; int ret; - host->is2440 = (int) of_device_get_match_data(&pdev->dev); + host->is2440 = (long) of_device_get_match_data(&pdev->dev); ret = mmc_of_parse(mmc); if (ret) From 4eabf0f20a2aa6e1e099faa816c4b9e9a2155891 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Thu, 3 Sep 2020 07:43:33 +0200 Subject: [PATCH 058/271] mmc: s3cmci: Drop unused variables in dbg_dumpregs The 'imask' and 'bsize' are not used in dbg_dumpregs: drivers/mmc/host/s3cmci.c:149:36: warning: variable 'imask' set but not used [-Wunused-but-set-variable] drivers/mmc/host/s3cmci.c:148:63: warning: variable 'bsize' set but not used [-Wunused-but-set-variable] Reported-by: kernel test robot Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200903054333.18331-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/s3cmci.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index 704103a671e7..a725fb1abce7 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -150,8 +150,8 @@ static void s3cmci_reset(struct s3cmci_host *host); static void dbg_dumpregs(struct s3cmci_host *host, char *prefix) { - u32 con, pre, cmdarg, cmdcon, cmdsta, r0, r1, r2, r3, timer, bsize; - u32 datcon, datcnt, datsta, fsta, imask; + u32 con, pre, cmdarg, cmdcon, cmdsta, r0, r1, r2, r3, timer; + u32 datcon, datcnt, datsta, fsta; con = readl(host->base + S3C2410_SDICON); pre = readl(host->base + S3C2410_SDIPRE); @@ -163,12 +163,10 @@ static void dbg_dumpregs(struct s3cmci_host *host, char *prefix) r2 = readl(host->base + S3C2410_SDIRSP2); r3 = readl(host->base + S3C2410_SDIRSP3); timer = readl(host->base + S3C2410_SDITIMER); - bsize = readl(host->base + S3C2410_SDIBSIZE); datcon = readl(host->base + S3C2410_SDIDCON); datcnt = readl(host->base + S3C2410_SDIDCNT); datsta = readl(host->base + S3C2410_SDIDSTA); fsta = readl(host->base + S3C2410_SDIFSTA); - imask = readl(host->base + host->sdiimsk); dbg(host, dbg_debug, "%s CON:[%08x] PRE:[%08x] TMR:[%08x]\n", prefix, con, pre, timer); From 8c98644bfc45516c81dadb99ebd0f8461688add3 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Wed, 2 Sep 2020 16:43:15 -0700 Subject: [PATCH 059/271] mmc: sdhci-msm: Prefer asynchronous probe Turning on initcall debug on one system showed this: initcall sdhci_msm_driver_init+0x0/0x28 returned 0 after 34782 usecs The lion's share of this time (~33 ms) was in mmc_power_up(). This shouldn't be terribly surprising since there are a few calls to delay based on "power_delay_ms" and the default delay there is 10 ms. Because we haven't specified that we'd prefer asynchronous probe for this driver then we'll wait for this driver to finish before we start probes for more drivers. While 33 ms doesn't sound like tons, every little bit counts. There should be little problem with turning on asynchronous probe for this driver. It's already possible that previous drivers may have turned on asynchronous probe so we might already have other things (that probed before us) probing at the same time we are anyway. This driver isn't really providing resources (clocks, regulators, etc) that other drivers need to probe and even if it was they should be handling -EPROBE_DEFER. Let's turn this on and get a bit of boot speed back. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200902164303.1.I5e598a25222b4534c0083b61dbfa4e0e76f66171@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index d4c02884cca8..9dd0dbb65382 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -2542,6 +2542,7 @@ static struct platform_driver sdhci_msm_driver = { .name = "sdhci_msm", .of_match_table = sdhci_msm_dt_match, .pm = &sdhci_msm_pm_ops, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; From 24fdcb377892b44dfe9755a62e4b92e9c6a0b359 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Sep 2020 17:02:47 +0200 Subject: [PATCH 060/271] mmc: core: when downgrading HS400, callback into drivers earlier The driver specific downgrade function makes more sense if we run it before we set the timing to something lower, not after. Otherwise some non-HS400 communication has already happened. No need to convert users. There is only one currently which needs this change in a following patch. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200901150250.26236-2-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index b3fa193de846..ba2852b684b1 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1168,13 +1168,13 @@ static int mmc_select_hs400(struct mmc_card *card) return err; } - /* Set host controller to HS timing */ - mmc_set_timing(card->host, MMC_TIMING_MMC_HS); - /* Prepare host to downgrade to HS timing */ if (host->ops->hs400_downgrade) host->ops->hs400_downgrade(host); + /* Set host controller to HS timing */ + mmc_set_timing(card->host, MMC_TIMING_MMC_HS); + /* Reduce frequency to HS frequency */ max_dtr = card->ext_csd.hs_max_dtr; mmc_set_clock(host, max_dtr); @@ -1253,6 +1253,9 @@ int mmc_hs400_to_hs200(struct mmc_card *card) if (err) goto out_err; + if (host->ops->hs400_downgrade) + host->ops->hs400_downgrade(host); + mmc_set_timing(host, MMC_TIMING_MMC_DDR52); err = mmc_switch_status(card, true); @@ -1268,9 +1271,6 @@ int mmc_hs400_to_hs200(struct mmc_card *card) mmc_set_timing(host, MMC_TIMING_MMC_HS); - if (host->ops->hs400_downgrade) - host->ops->hs400_downgrade(host); - err = mmc_switch_status(card, true); if (err) goto out_err; From 97a7d87e96b02fc5b3944d7735e0f6b8446d07da Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Sep 2020 17:02:48 +0200 Subject: [PATCH 061/271] mmc: core: add a 'doing_init_tune' flag and a 'mmc_doing_tune' helper Our driver needs to know when tuning is in progress. 'doing_retune' only covers re-tuning, not the initial tuning. Add another flag to detect the initial tuning state and add a helper which tells us if any kind of tuning is going on. Only implemented for MMC currently because that's where we need it. SD can be added later if it becomes necessary. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200901150250.26236-3-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 4 ++++ include/linux/mmc/host.h | 6 ++++++ 2 files changed, 10 insertions(+) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index ba2852b684b1..216bd1aed373 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1763,6 +1763,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, goto free_card; if (mmc_card_hs200(card)) { + host->doing_init_tune = 1; + err = mmc_hs200_tuning(card); if (err) goto free_card; @@ -1770,6 +1772,8 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, err = mmc_select_hs400(card); if (err) goto free_card; + + host->doing_init_tune = 0; } else if (!mmc_card_hs400es(card)) { /* Select the desired bus width optionally */ err = mmc_select_bus_width(card); diff --git a/include/linux/mmc/host.h b/include/linux/mmc/host.h index 799e23b0a23c..c079b932330f 100644 --- a/include/linux/mmc/host.h +++ b/include/linux/mmc/host.h @@ -400,6 +400,7 @@ struct mmc_host { unsigned int use_spi_crc:1; unsigned int claimed:1; /* host exclusively claimed */ unsigned int bus_dead:1; /* bus has been released */ + unsigned int doing_init_tune:1; /* initial tuning in progress */ unsigned int can_retune:1; /* re-tuning can be used */ unsigned int doing_retune:1; /* re-tuning in progress */ unsigned int retune_now:1; /* do re-tuning at next req */ @@ -595,6 +596,11 @@ static inline bool mmc_doing_retune(struct mmc_host *host) return host->doing_retune == 1; } +static inline bool mmc_doing_tune(struct mmc_host *host) +{ + return host->doing_retune == 1 || host->doing_init_tune == 1; +} + static inline enum dma_data_direction mmc_get_dma_dir(struct mmc_data *data) { return data->flags & MMC_DATA_WRITE ? DMA_TO_DEVICE : DMA_FROM_DEVICE; From 0f93db6542fa94262b611ff942b9b3ad7ecbea10 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Sep 2020 17:02:49 +0200 Subject: [PATCH 062/271] mmc: renesas_sdhi: keep SCC clock active when tuning Tuning procedure switches to lower frequencies but that will turn the SCC off and accessing its register then will hang. So, check when we are tuning and keep the current setup of the external clock if we are doing so. Note that we still switch to the lower frequency because of the internal divider. We just make sure to not modify the external clock. This patch depends on a MMC core patch calling the downgrade function earlier. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200901150250.26236-4-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 1243c42d3d1b..42eaec40a6d9 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -117,8 +117,12 @@ static unsigned int renesas_sdhi_clk_update(struct tmio_mmc_host *host, unsigned int freq, diff, best_freq = 0, diff_min = ~0; int i; - /* tested only on R-Car Gen2+ currently; may work for others */ - if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2)) + /* + * We simply return the current rate if a) we are not on a R-Car Gen2+ + * SoC (may work for others, but untested) or b) if the SCC needs its + * clock during tuning, so we don't change the external clock setup. + */ + if (!(host->pdata->flags & TMIO_MMC_MIN_RCAR2) || mmc_doing_tune(host->mmc)) return clk_get_rate(priv->clk); /* From 452f553e272cf010ee1ac2d4896b265a6fbf4e29 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 1 Sep 2020 17:02:50 +0200 Subject: [PATCH 063/271] mmc: core: simplify an expression We already have 'host' as a variable, so use it. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200901150250.26236-5-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 216bd1aed373..67e95eba0e82 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1173,7 +1173,7 @@ static int mmc_select_hs400(struct mmc_card *card) host->ops->hs400_downgrade(host); /* Set host controller to HS timing */ - mmc_set_timing(card->host, MMC_TIMING_MMC_HS); + mmc_set_timing(host, MMC_TIMING_MMC_HS); /* Reduce frequency to HS frequency */ max_dtr = card->ext_csd.hs_max_dtr; From 77811ffa6f66ae4051451b0a9ebb22b2b1fdd466 Mon Sep 17 00:00:00 2001 From: Xu Wang Date: Thu, 3 Sep 2020 08:48:25 +0000 Subject: [PATCH 064/271] mmc: omap-hsmmc: remove redundant null check Because clk_disable_unprepare already checked NULL clock parameter, so the additional checks are unnecessary, just remove them. Signed-off-by: Xu Wang Link: https://lore.kernel.org/r/20200903084825.85616-1-vulab@iscas.ac.cn Signed-off-by: Ulf Hansson --- drivers/mmc/host/omap_hsmmc.c | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index 37b8740513f5..d02983e23ed1 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -1114,8 +1114,7 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd) int ret; /* Disable the clocks */ - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); /* Turn the power off */ ret = omap_hsmmc_set_power(host, 0); @@ -1123,8 +1122,7 @@ static int omap_hsmmc_switch_opcond(struct omap_hsmmc_host *host, int vdd) /* Turn the power ON with given VDD 1.8 or 3.0v */ if (!ret) ret = omap_hsmmc_set_power(host, 1); - if (host->dbclk) - clk_prepare_enable(host->dbclk); + clk_prepare_enable(host->dbclk); if (ret != 0) goto err; @@ -2014,8 +2012,7 @@ static int omap_hsmmc_probe(struct platform_device *pdev) pm_runtime_dont_use_autosuspend(host->dev); pm_runtime_put_sync(host->dev); pm_runtime_disable(host->dev); - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); err1: mmc_free_host(mmc); err: @@ -2037,8 +2034,7 @@ static int omap_hsmmc_remove(struct platform_device *pdev) pm_runtime_put_sync(host->dev); pm_runtime_disable(host->dev); device_init_wakeup(&pdev->dev, false); - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); mmc_free_host(host->mmc); @@ -2063,8 +2059,7 @@ static int omap_hsmmc_suspend(struct device *dev) OMAP_HSMMC_READ(host->base, HCTL) & ~SDBP); } - if (host->dbclk) - clk_disable_unprepare(host->dbclk); + clk_disable_unprepare(host->dbclk); pm_runtime_put_sync(host->dev); return 0; @@ -2080,8 +2075,7 @@ static int omap_hsmmc_resume(struct device *dev) pm_runtime_get_sync(host->dev); - if (host->dbclk) - clk_prepare_enable(host->dbclk); + clk_prepare_enable(host->dbclk); if (!(host->mmc->pm_flags & MMC_PM_KEEP_POWER)) omap_hsmmc_conf_bus_power(host); From 55118e5358343c74b7a607d5b9251df8acc3fca9 Mon Sep 17 00:00:00 2001 From: Robin Murphy Date: Thu, 3 Sep 2020 22:18:06 +0100 Subject: [PATCH 065/271] mmc: renesas_sdhi: Drop local dma_parms Since commit 9495b7e92f71 ("driver core: platform: Initialize dma_parms for platform devices"), struct platform_device already provides a dma_parms structure, so we can save allocating another one. Signed-off-by: Robin Murphy Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang Link: https://lore.kernel.org/r/85e1fc97dbec3dea96102785a5e308ccb5e91cfe.1599167798.git.robin.murphy@arm.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_internal_dmac.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index 32ab991544ef..8ee6b2b85ec2 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -336,10 +336,6 @@ static int renesas_sdhi_internal_dmac_probe(struct platform_device *pdev) if (soc) global_flags |= (unsigned long)soc->data; - dev->dma_parms = devm_kzalloc(dev, sizeof(*dev->dma_parms), GFP_KERNEL); - if (!dev->dma_parms) - return -ENOMEM; - /* value is max of SD_SECCNT. Confirmed by HW engineers */ dma_set_max_seg_size(dev, 0xffffffff); From f5d373ec2822fa0f3d52cec9a086115140e7d4e7 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 4 Sep 2020 18:43:14 +0200 Subject: [PATCH 066/271] mmc: host: Drop unneeded MMC dependency in Kconfig MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit All entries in Kconfig are already part of "if MMC", so there is no need for additional dependency on MMC. Suggested-by: Michał Mirosław Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200904164315.24618-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 0af8b12a8f7d..ddbed6f41c41 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -422,7 +422,7 @@ config MMC_SDHCI_IPROC config MMC_MESON_GX tristate "Amlogic S905/GX*/AXG SD/MMC Host Controller support" - depends on ARCH_MESON && MMC + depends on ARCH_MESON help This selects support for the Amlogic SD/MMC Host Controller found on the S905/GX*/AXG family of SoCs. This controller is @@ -458,7 +458,7 @@ config MMC_MESON_MX_SDIO config MMC_MOXART tristate "MOXART SD/MMC Host Controller support" - depends on ARCH_MOXART && MMC + depends on ARCH_MOXART help This selects support for the MOXART SD/MMC Host Controller. MOXA provides one multi-functional card reader which can From 54d8454436a205682bd89d66d8d9eedbc8452d15 Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Mon, 7 Sep 2020 12:52:54 +0200 Subject: [PATCH 067/271] mmc: host: Enable compile testing of multiple drivers Multiple MMC host controller driver can be compile tested as they do not depend on architecture specific headers. Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200907105254.31097-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index ddbed6f41c41..dc646359b4ff 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -178,7 +178,7 @@ config MMC_SDHCI_OF_AT91 config MMC_SDHCI_OF_ESDHC tristate "SDHCI OF support for the Freescale eSDHC controller" depends on MMC_SDHCI_PLTFM - depends on PPC || ARCH_MXC || ARCH_LAYERSCAPE + depends on PPC || ARCH_MXC || ARCH_LAYERSCAPE || COMPILE_TEST select MMC_SDHCI_IO_ACCESSORS select FSL_GUTS help @@ -216,7 +216,7 @@ config MMC_SDHCI_OF_DWCMSHC config MMC_SDHCI_OF_SPARX5 tristate "SDHCI OF support for the MCHP Sparx5 SoC" depends on MMC_SDHCI_PLTFM - depends on ARCH_SPARX5 + depends on ARCH_SPARX5 || COMPILE_TEST help This selects the Secure Digital Host Controller Interface (SDHCI) found in the MCHP Sparx5 SoC. @@ -238,7 +238,7 @@ config MMC_SDHCI_CADENCE config MMC_SDHCI_CNS3XXX tristate "SDHCI support on the Cavium Networks CNS3xxx SoC" - depends on ARCH_CNS3XXX + depends on ARCH_CNS3XXX || COMPILE_TEST depends on MMC_SDHCI_PLTFM help This selects the SDHCI support for CNS3xxx System-on-Chip devices. @@ -262,7 +262,7 @@ config MMC_SDHCI_ESDHC_MCF config MMC_SDHCI_ESDHC_IMX tristate "SDHCI support for the Freescale eSDHC/uSDHC i.MX controller" - depends on ARCH_MXC + depends on ARCH_MXC || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI @@ -276,7 +276,7 @@ config MMC_SDHCI_ESDHC_IMX config MMC_SDHCI_DOVE tristate "SDHCI support on Marvell's Dove SoC" - depends on ARCH_DOVE || MACH_DOVE + depends on ARCH_DOVE || MACH_DOVE || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help @@ -289,7 +289,7 @@ config MMC_SDHCI_DOVE config MMC_SDHCI_TEGRA tristate "SDHCI platform support for the Tegra SD/MMC Controller" - depends on ARCH_TEGRA + depends on ARCH_TEGRA || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_CQHCI @@ -301,7 +301,8 @@ config MMC_SDHCI_TEGRA config MMC_SDHCI_S3C tristate "SDHCI support on Samsung S3C SoC" - depends on MMC_SDHCI && PLAT_SAMSUNG + depends on MMC_SDHCI + depends on PLAT_SAMSUNG || COMPILE_TEST help This selects the Secure Digital Host Controller Interface (SDHCI) often referrered to as the HSMMC block in some of the Samsung S3C @@ -313,7 +314,7 @@ config MMC_SDHCI_S3C config MMC_SDHCI_SIRF tristate "SDHCI support on CSR SiRFprimaII and SiRFmarco SoCs" - depends on ARCH_SIRF + depends on ARCH_SIRF || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help @@ -351,7 +352,8 @@ config MMC_SDHCI_PXAV2 config MMC_SDHCI_SPEAR tristate "SDHCI support on ST SPEAr platform" - depends on MMC_SDHCI && PLAT_SPEAR + depends on MMC_SDHCI + depends on PLAT_SPEAR || COMPILE_TEST depends on OF help This selects the Secure Digital Host Controller Interface (SDHCI) @@ -374,7 +376,7 @@ config MMC_SDHCI_S3C_DMA config MMC_SDHCI_BCM_KONA tristate "SDHCI support on Broadcom KONA platform" - depends on ARCH_BCM_MOBILE + depends on ARCH_BCM_MOBILE || COMPILE_TEST depends on MMC_SDHCI_PLTFM help This selects the Broadcom Kona Secure Digital Host Controller @@ -422,7 +424,7 @@ config MMC_SDHCI_IPROC config MMC_MESON_GX tristate "Amlogic S905/GX*/AXG SD/MMC Host Controller support" - depends on ARCH_MESON + depends on ARCH_MESON|| COMPILE_TEST help This selects support for the Amlogic SD/MMC Host Controller found on the S905/GX*/AXG family of SoCs. This controller is @@ -458,7 +460,7 @@ config MMC_MESON_MX_SDIO config MMC_MOXART tristate "MOXART SD/MMC Host Controller support" - depends on ARCH_MOXART + depends on ARCH_MOXART || COMPILE_TEST help This selects support for the MOXART SD/MMC Host Controller. MOXA provides one multi-functional card reader which can @@ -467,7 +469,7 @@ config MMC_MOXART config MMC_SDHCI_ST tristate "SDHCI support on STMicroelectronics SoC" - depends on ARCH_STI || FSP2 + depends on ARCH_STI || FSP2 || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS help @@ -587,7 +589,7 @@ config MMC_TIFM_SD config MMC_MVSDIO tristate "Marvell MMC/SD/SDIO host driver" - depends on PLAT_ORION + depends on PLAT_ORION || (COMPILE_TEST && ARM) depends on OF help This selects the Marvell SDIO host driver. @@ -599,7 +601,7 @@ config MMC_MVSDIO config MMC_DAVINCI tristate "TI DAVINCI Multimedia Card Interface support" - depends on ARCH_DAVINCI + depends on ARCH_DAVINCI || COMPILE_TEST help This selects the TI DAVINCI Multimedia card Interface. If you have an DAVINCI board with a Multimedia Card slot, @@ -628,7 +630,7 @@ config MMC_SPI config MMC_S3C tristate "Samsung S3C SD/MMC Card Interface support" - depends on ARCH_S3C24XX + depends on ARCH_S3C24XX || COMPILE_TEST depends on S3C24XX_DMAC help This selects a driver for the MCI interface found in @@ -681,7 +683,7 @@ config MMC_SDRICOH_CS config MMC_SDHCI_SPRD tristate "Spreadtrum SDIO host Controller" - depends on ARCH_SPRD + depends on ARCH_SPRD || COMPILE_TEST depends on MMC_SDHCI_PLTFM select MMC_SDHCI_IO_ACCESSORS select MMC_HSQ @@ -698,7 +700,7 @@ config MMC_TMIO_CORE config MMC_TMIO tristate "Toshiba Mobile IO Controller (TMIO) MMC/SD function support" - depends on MFD_TMIO || MFD_ASIC3 + depends on MFD_TMIO || MFD_ASIC3 || COMPILE_TEST select MMC_TMIO_CORE help This provides support for the SD/MMC cell found in TC6393XB, @@ -971,7 +973,7 @@ config MMC_REALTEK_USB config MMC_SUNXI tristate "Allwinner sunxi SD/MMC Host Controller support" - depends on ARCH_SUNXI + depends on ARCH_SUNXI || COMPILE_TEST help This selects support for the SD/MMC Host Controller on Allwinner sunxi SoCs. From 21b2cec61c04bd175f0860d9411a472d5a0e7ba1 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 3 Sep 2020 16:24:36 -0700 Subject: [PATCH 068/271] mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in v4.4 This is like commit 3d3451124f3d ("mmc: sdhci-msm: Prefer asynchronous probe") but applied to a whole pile of drivers. This batch converts the drivers that appeared to be around in the v4.4 timeframe. Signed-off-by: Douglas Anderson Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang # SH_MMCIF Tested-by: Thierry Reding Link: https://lore.kernel.org/r/20200903162412.1.Id501e96fa63224f77bb86b2135a5e8324ffb9c43@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/android-goldfish.c | 1 + drivers/mmc/host/atmel-mci.c | 1 + drivers/mmc/host/au1xmmc.c | 1 + drivers/mmc/host/davinci_mmc.c | 1 + drivers/mmc/host/dw_mmc-exynos.c | 1 + drivers/mmc/host/dw_mmc-k3.c | 1 + drivers/mmc/host/dw_mmc-pltfm.c | 1 + drivers/mmc/host/dw_mmc-rockchip.c | 1 + drivers/mmc/host/jz4740_mmc.c | 1 + drivers/mmc/host/moxart-mmc.c | 1 + drivers/mmc/host/mtk-sd.c | 1 + drivers/mmc/host/mvsdio.c | 1 + drivers/mmc/host/mxcmmc.c | 1 + drivers/mmc/host/mxs-mmc.c | 1 + drivers/mmc/host/omap.c | 1 + drivers/mmc/host/omap_hsmmc.c | 1 + drivers/mmc/host/pxamci.c | 1 + drivers/mmc/host/rtsx_pci_sdmmc.c | 1 + drivers/mmc/host/rtsx_usb_sdmmc.c | 1 + drivers/mmc/host/s3cmci.c | 1 + drivers/mmc/host/sdhci-acpi.c | 1 + drivers/mmc/host/sdhci-bcm-kona.c | 1 + drivers/mmc/host/sdhci-cns3xxx.c | 1 + drivers/mmc/host/sdhci-dove.c | 1 + drivers/mmc/host/sdhci-esdhc-imx.c | 1 + drivers/mmc/host/sdhci-iproc.c | 1 + drivers/mmc/host/sdhci-of-arasan.c | 1 + drivers/mmc/host/sdhci-of-at91.c | 1 + drivers/mmc/host/sdhci-of-esdhc.c | 1 + drivers/mmc/host/sdhci-of-hlwd.c | 1 + drivers/mmc/host/sdhci-pxav2.c | 1 + drivers/mmc/host/sdhci-pxav3.c | 1 + drivers/mmc/host/sdhci-s3c.c | 1 + drivers/mmc/host/sdhci-sirf.c | 1 + drivers/mmc/host/sdhci-spear.c | 1 + drivers/mmc/host/sdhci-st.c | 1 + drivers/mmc/host/sdhci-tegra.c | 1 + drivers/mmc/host/sdhci_f_sdh30.c | 1 + drivers/mmc/host/sh_mmcif.c | 1 + drivers/mmc/host/sunxi-mmc.c | 1 + drivers/mmc/host/tmio_mmc.c | 1 + drivers/mmc/host/usdhi6rol0.c | 1 + drivers/mmc/host/wbsd.c | 1 + drivers/mmc/host/wmt-sdmmc.c | 1 + 44 files changed, 44 insertions(+) diff --git a/drivers/mmc/host/android-goldfish.c b/drivers/mmc/host/android-goldfish.c index ceb4924e02d0..e878fdf8f20a 100644 --- a/drivers/mmc/host/android-goldfish.c +++ b/drivers/mmc/host/android-goldfish.c @@ -537,6 +537,7 @@ static struct platform_driver goldfish_mmc_driver = { .remove = goldfish_mmc_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/atmel-mci.c b/drivers/mmc/host/atmel-mci.c index 3fc3bbea8536..444bd3a0a922 100644 --- a/drivers/mmc/host/atmel-mci.c +++ b/drivers/mmc/host/atmel-mci.c @@ -2668,6 +2668,7 @@ static struct platform_driver atmci_driver = { .remove = atmci_remove, .driver = { .name = "atmel_mci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(atmci_dt_ids), .pm = &atmci_dev_pm_ops, }, diff --git a/drivers/mmc/host/au1xmmc.c b/drivers/mmc/host/au1xmmc.c index 9bb1910268ca..bd00515fbaba 100644 --- a/drivers/mmc/host/au1xmmc.c +++ b/drivers/mmc/host/au1xmmc.c @@ -1189,6 +1189,7 @@ static struct platform_driver au1xmmc_driver = { .resume = au1xmmc_resume, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/davinci_mmc.c b/drivers/mmc/host/davinci_mmc.c index 66d740ee7d45..90cd179625fc 100644 --- a/drivers/mmc/host/davinci_mmc.c +++ b/drivers/mmc/host/davinci_mmc.c @@ -1395,6 +1395,7 @@ static const struct dev_pm_ops davinci_mmcsd_pm = { static struct platform_driver davinci_mmcsd_driver = { .driver = { .name = "davinci_mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = davinci_mmcsd_pm_ops, .of_match_table = davinci_mmc_dt_ids, }, diff --git a/drivers/mmc/host/dw_mmc-exynos.c b/drivers/mmc/host/dw_mmc-exynos.c index 95adeee07217..0c75810812a0 100644 --- a/drivers/mmc/host/dw_mmc-exynos.c +++ b/drivers/mmc/host/dw_mmc-exynos.c @@ -592,6 +592,7 @@ static struct platform_driver dw_mci_exynos_pltfm_driver = { .remove = dw_mci_exynos_remove, .driver = { .name = "dwmmc_exynos", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_exynos_match, .pm = &dw_mci_exynos_pmops, }, diff --git a/drivers/mmc/host/dw_mmc-k3.c b/drivers/mmc/host/dw_mmc-k3.c index db1a84b2ba61..29d2494eb27a 100644 --- a/drivers/mmc/host/dw_mmc-k3.c +++ b/drivers/mmc/host/dw_mmc-k3.c @@ -473,6 +473,7 @@ static struct platform_driver dw_mci_k3_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dwmmc_k3", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_k3_match, .pm = &dw_mci_k3_dev_pm_ops, }, diff --git a/drivers/mmc/host/dw_mmc-pltfm.c b/drivers/mmc/host/dw_mmc-pltfm.c index 7de37f524a96..73731cd3ba23 100644 --- a/drivers/mmc/host/dw_mmc-pltfm.c +++ b/drivers/mmc/host/dw_mmc-pltfm.c @@ -98,6 +98,7 @@ static struct platform_driver dw_mci_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dw_mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_pltfm_match, .pm = &dw_mci_pltfm_pmops, }, diff --git a/drivers/mmc/host/dw_mmc-rockchip.c b/drivers/mmc/host/dw_mmc-rockchip.c index d4d02134848c..753502ce3c85 100644 --- a/drivers/mmc/host/dw_mmc-rockchip.c +++ b/drivers/mmc/host/dw_mmc-rockchip.c @@ -383,6 +383,7 @@ static struct platform_driver dw_mci_rockchip_pltfm_driver = { .remove = dw_mci_rockchip_remove, .driver = { .name = "dwmmc_rockchip", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_rockchip_match, .pm = &dw_mci_rockchip_dev_pm_ops, }, diff --git a/drivers/mmc/host/jz4740_mmc.c b/drivers/mmc/host/jz4740_mmc.c index fea0a42010c7..a1f92fed2a55 100644 --- a/drivers/mmc/host/jz4740_mmc.c +++ b/drivers/mmc/host/jz4740_mmc.c @@ -1124,6 +1124,7 @@ static struct platform_driver jz4740_mmc_driver = { .remove = jz4740_mmc_remove, .driver = { .name = "jz4740-mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(jz4740_mmc_of_match), .pm = pm_ptr(&jz4740_mmc_pm_ops), }, diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index fc6b9cf27d0b..2bfb376fddc4 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -717,6 +717,7 @@ static struct platform_driver moxart_mmc_driver = { .remove = moxart_remove, .driver = { .name = "mmc-moxart", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = moxart_mmc_match, }, }; diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index ed2b24691b4f..691c928edab7 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -2689,6 +2689,7 @@ static struct platform_driver mt_msdc_driver = { .remove = msdc_drv_remove, .driver = { .name = "mtk-msdc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = msdc_of_ids, .pm = &msdc_dev_pm_ops, }, diff --git a/drivers/mmc/host/mvsdio.c b/drivers/mmc/host/mvsdio.c index cc0752a9df6d..629efbe639c4 100644 --- a/drivers/mmc/host/mvsdio.c +++ b/drivers/mmc/host/mvsdio.c @@ -824,6 +824,7 @@ static struct platform_driver mvsd_driver = { .remove = mvsd_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = mvsdio_dt_ids, }, }; diff --git a/drivers/mmc/host/mxcmmc.c b/drivers/mmc/host/mxcmmc.c index b3d654c688e5..12ee07285980 100644 --- a/drivers/mmc/host/mxcmmc.c +++ b/drivers/mmc/host/mxcmmc.c @@ -1244,6 +1244,7 @@ static struct platform_driver mxcmci_driver = { .id_table = mxcmci_devtype, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &mxcmci_pm_ops, .of_match_table = mxcmci_of_match, } diff --git a/drivers/mmc/host/mxs-mmc.c b/drivers/mmc/host/mxs-mmc.c index b1820def36c0..75007f61df97 100644 --- a/drivers/mmc/host/mxs-mmc.c +++ b/drivers/mmc/host/mxs-mmc.c @@ -726,6 +726,7 @@ static struct platform_driver mxs_mmc_driver = { .id_table = mxs_ssp_ids, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &mxs_mmc_pm_ops, .of_match_table = mxs_mmc_dt_ids, }, diff --git a/drivers/mmc/host/omap.c b/drivers/mmc/host/omap.c index 33d7af7c7762..6aa0537f1f84 100644 --- a/drivers/mmc/host/omap.c +++ b/drivers/mmc/host/omap.c @@ -1504,6 +1504,7 @@ static struct platform_driver mmc_omap_driver = { .remove = mmc_omap_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(mmc_omap_match), }, }; diff --git a/drivers/mmc/host/omap_hsmmc.c b/drivers/mmc/host/omap_hsmmc.c index d02983e23ed1..aa9cc49206d1 100644 --- a/drivers/mmc/host/omap_hsmmc.c +++ b/drivers/mmc/host/omap_hsmmc.c @@ -2165,6 +2165,7 @@ static struct platform_driver omap_hsmmc_driver = { .remove = omap_hsmmc_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &omap_hsmmc_dev_pm_ops, .of_match_table = of_match_ptr(omap_mmc_of_match), }, diff --git a/drivers/mmc/host/pxamci.c b/drivers/mmc/host/pxamci.c index 3a9333475a2b..29f6180a0036 100644 --- a/drivers/mmc/host/pxamci.c +++ b/drivers/mmc/host/pxamci.c @@ -811,6 +811,7 @@ static struct platform_driver pxamci_driver = { .remove = pxamci_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(pxa_mmc_dt_ids), }, }; diff --git a/drivers/mmc/host/rtsx_pci_sdmmc.c b/drivers/mmc/host/rtsx_pci_sdmmc.c index 2763a376b054..eb395e144207 100644 --- a/drivers/mmc/host/rtsx_pci_sdmmc.c +++ b/drivers/mmc/host/rtsx_pci_sdmmc.c @@ -1471,6 +1471,7 @@ static struct platform_driver rtsx_pci_sdmmc_driver = { .id_table = rtsx_pci_sdmmc_ids, .driver = { .name = DRV_NAME_RTSX_PCI_SDMMC, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; module_platform_driver(rtsx_pci_sdmmc_driver); diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index 7225d9312af8..598f49573f5d 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -1458,6 +1458,7 @@ static struct platform_driver rtsx_usb_sdmmc_driver = { .id_table = rtsx_usb_sdmmc_ids, .driver = { .name = "rtsx_usb_sdmmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &rtsx_usb_sdmmc_dev_pm_ops, }, }; diff --git a/drivers/mmc/host/s3cmci.c b/drivers/mmc/host/s3cmci.c index a725fb1abce7..e3698aba8dd3 100644 --- a/drivers/mmc/host/s3cmci.c +++ b/drivers/mmc/host/s3cmci.c @@ -1804,6 +1804,7 @@ MODULE_DEVICE_TABLE(platform, s3cmci_driver_ids); static struct platform_driver s3cmci_driver = { .driver = { .name = "s3c-sdi", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = s3cmci_dt_match, }, .id_table = s3cmci_driver_ids, diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index 284cba11e279..fc41d0451f20 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -1027,6 +1027,7 @@ static const struct dev_pm_ops sdhci_acpi_pm_ops = { static struct platform_driver sdhci_acpi_driver = { .driver = { .name = "sdhci-acpi", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .acpi_match_table = sdhci_acpi_ids, .pm = &sdhci_acpi_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-bcm-kona.c b/drivers/mmc/host/sdhci-bcm-kona.c index a6c2bd202b45..4d4aac85cc7a 100644 --- a/drivers/mmc/host/sdhci-bcm-kona.c +++ b/drivers/mmc/host/sdhci-bcm-kona.c @@ -324,6 +324,7 @@ static int sdhci_bcm_kona_probe(struct platform_device *pdev) static struct platform_driver sdhci_bcm_kona_driver = { .driver = { .name = "sdhci-kona", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = sdhci_bcm_kona_of_match, }, diff --git a/drivers/mmc/host/sdhci-cns3xxx.c b/drivers/mmc/host/sdhci-cns3xxx.c index 811eab1b8964..2a29c7a4f308 100644 --- a/drivers/mmc/host/sdhci-cns3xxx.c +++ b/drivers/mmc/host/sdhci-cns3xxx.c @@ -98,6 +98,7 @@ static int sdhci_cns3xxx_probe(struct platform_device *pdev) static struct platform_driver sdhci_cns3xxx_driver = { .driver = { .name = "sdhci-cns3xxx", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, }, .probe = sdhci_cns3xxx_probe, diff --git a/drivers/mmc/host/sdhci-dove.c b/drivers/mmc/host/sdhci-dove.c index fe9da3122fe9..5e5bf82e5976 100644 --- a/drivers/mmc/host/sdhci-dove.c +++ b/drivers/mmc/host/sdhci-dove.c @@ -105,6 +105,7 @@ MODULE_DEVICE_TABLE(of, sdhci_dove_of_match_table); static struct platform_driver sdhci_dove_driver = { .driver = { .name = "sdhci-dove", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = sdhci_dove_of_match_table, }, diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index a2eb36c9b1da..62b3d4767916 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1925,6 +1925,7 @@ static const struct dev_pm_ops sdhci_esdhc_pmops = { static struct platform_driver sdhci_esdhc_imx_driver = { .driver = { .name = "sdhci-esdhc-imx", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = imx_esdhc_dt_ids, .pm = &sdhci_esdhc_pmops, }, diff --git a/drivers/mmc/host/sdhci-iproc.c b/drivers/mmc/host/sdhci-iproc.c index b540aa6faacb..c9434b461aab 100644 --- a/drivers/mmc/host/sdhci-iproc.c +++ b/drivers/mmc/host/sdhci-iproc.c @@ -369,6 +369,7 @@ static int sdhci_iproc_probe(struct platform_device *pdev) static struct platform_driver sdhci_iproc_driver = { .driver = { .name = "sdhci-iproc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_iproc_of_match, .acpi_match_table = ACPI_PTR(sdhci_iproc_acpi_ids), .pm = &sdhci_pltfm_pmops, diff --git a/drivers/mmc/host/sdhci-of-arasan.c b/drivers/mmc/host/sdhci-of-arasan.c index 3aea46db2ea7..829ccef87426 100644 --- a/drivers/mmc/host/sdhci-of-arasan.c +++ b/drivers/mmc/host/sdhci-of-arasan.c @@ -1693,6 +1693,7 @@ static int sdhci_arasan_remove(struct platform_device *pdev) static struct platform_driver sdhci_arasan_driver = { .driver = { .name = "sdhci-arasan", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_arasan_of_match, .pm = &sdhci_arasan_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-of-at91.c b/drivers/mmc/host/sdhci-of-at91.c index 1ece2c50042c..5564d7b23e7c 100644 --- a/drivers/mmc/host/sdhci-of-at91.c +++ b/drivers/mmc/host/sdhci-of-at91.c @@ -465,6 +465,7 @@ static int sdhci_at91_remove(struct platform_device *pdev) static struct platform_driver sdhci_at91_driver = { .driver = { .name = "sdhci-at91", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_at91_dt_match, .pm = &sdhci_at91_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 45881b309956..3a3340c9b8c9 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -1468,6 +1468,7 @@ static int sdhci_esdhc_probe(struct platform_device *pdev) static struct platform_driver sdhci_esdhc_driver = { .driver = { .name = "sdhci-esdhc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_esdhc_of_match, .pm = &esdhc_of_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci-of-hlwd.c b/drivers/mmc/host/sdhci-of-hlwd.c index da844a39af6e..12675797b296 100644 --- a/drivers/mmc/host/sdhci-of-hlwd.c +++ b/drivers/mmc/host/sdhci-of-hlwd.c @@ -80,6 +80,7 @@ MODULE_DEVICE_TABLE(of, sdhci_hlwd_of_match); static struct platform_driver sdhci_hlwd_driver = { .driver = { .name = "sdhci-hlwd", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_hlwd_of_match, .pm = &sdhci_pltfm_pmops, }, diff --git a/drivers/mmc/host/sdhci-pxav2.c b/drivers/mmc/host/sdhci-pxav2.c index 9282bc4b8c41..f18906b5575f 100644 --- a/drivers/mmc/host/sdhci-pxav2.c +++ b/drivers/mmc/host/sdhci-pxav2.c @@ -226,6 +226,7 @@ static int sdhci_pxav2_probe(struct platform_device *pdev) static struct platform_driver sdhci_pxav2_driver = { .driver = { .name = "sdhci-pxav2", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_pxav2_of_match), .pm = &sdhci_pltfm_pmops, }, diff --git a/drivers/mmc/host/sdhci-pxav3.c b/drivers/mmc/host/sdhci-pxav3.c index e55037ceda73..a6d89a3f1946 100644 --- a/drivers/mmc/host/sdhci-pxav3.c +++ b/drivers/mmc/host/sdhci-pxav3.c @@ -567,6 +567,7 @@ static const struct dev_pm_ops sdhci_pxav3_pmops = { static struct platform_driver sdhci_pxav3_driver = { .driver = { .name = "sdhci-pxav3", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_pxav3_of_match), .pm = &sdhci_pxav3_pmops, }, diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index 080ced1e63f0..cb5f87be7535 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -784,6 +784,7 @@ static struct platform_driver sdhci_s3c_driver = { .id_table = sdhci_s3c_driver_ids, .driver = { .name = "s3c-sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_s3c_dt_match), .pm = &sdhci_s3c_pmops, }, diff --git a/drivers/mmc/host/sdhci-sirf.c b/drivers/mmc/host/sdhci-sirf.c index f4b05dd6c20a..e9b347b3af7e 100644 --- a/drivers/mmc/host/sdhci-sirf.c +++ b/drivers/mmc/host/sdhci-sirf.c @@ -220,6 +220,7 @@ MODULE_DEVICE_TABLE(of, sdhci_sirf_of_match); static struct platform_driver sdhci_sirf_driver = { .driver = { .name = "sdhci-sirf", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_sirf_of_match, .pm = &sdhci_pltfm_pmops, }, diff --git a/drivers/mmc/host/sdhci-spear.c b/drivers/mmc/host/sdhci-spear.c index b4b63089a4e2..d463e2fd5b1a 100644 --- a/drivers/mmc/host/sdhci-spear.c +++ b/drivers/mmc/host/sdhci-spear.c @@ -181,6 +181,7 @@ MODULE_DEVICE_TABLE(of, sdhci_spear_id_table); static struct platform_driver sdhci_driver = { .driver = { .name = "sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pm_ops, .of_match_table = of_match_ptr(sdhci_spear_id_table), }, diff --git a/drivers/mmc/host/sdhci-st.c b/drivers/mmc/host/sdhci-st.c index 1301cebfc3ea..4e9ff3e828ba 100644 --- a/drivers/mmc/host/sdhci-st.c +++ b/drivers/mmc/host/sdhci-st.c @@ -521,6 +521,7 @@ static struct platform_driver sdhci_st_driver = { .remove = sdhci_st_remove, .driver = { .name = "sdhci-st", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_st_pmops, .of_match_table = of_match_ptr(st_sdhci_match), }, diff --git a/drivers/mmc/host/sdhci-tegra.c b/drivers/mmc/host/sdhci-tegra.c index 428cdc53c858..ed12aacb1c73 100644 --- a/drivers/mmc/host/sdhci-tegra.c +++ b/drivers/mmc/host/sdhci-tegra.c @@ -1782,6 +1782,7 @@ static SIMPLE_DEV_PM_OPS(sdhci_tegra_dev_pm_ops, sdhci_tegra_suspend, static struct platform_driver sdhci_tegra_driver = { .driver = { .name = "sdhci-tegra", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_tegra_dt_match, .pm = &sdhci_tegra_dev_pm_ops, }, diff --git a/drivers/mmc/host/sdhci_f_sdh30.c b/drivers/mmc/host/sdhci_f_sdh30.c index 4625cc071b61..3f5977979cf2 100644 --- a/drivers/mmc/host/sdhci_f_sdh30.c +++ b/drivers/mmc/host/sdhci_f_sdh30.c @@ -219,6 +219,7 @@ MODULE_DEVICE_TABLE(acpi, f_sdh30_acpi_ids); static struct platform_driver sdhci_f_sdh30_driver = { .driver = { .name = "f_sdh30", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(f_sdh30_dt_ids), .acpi_match_table = ACPI_PTR(f_sdh30_acpi_ids), .pm = &sdhci_pltfm_pmops, diff --git a/drivers/mmc/host/sh_mmcif.c b/drivers/mmc/host/sh_mmcif.c index 9f53634aa411..e5e457037235 100644 --- a/drivers/mmc/host/sh_mmcif.c +++ b/drivers/mmc/host/sh_mmcif.c @@ -1562,6 +1562,7 @@ static struct platform_driver sh_mmcif_driver = { .remove = sh_mmcif_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sh_mmcif_dev_pm_ops, .of_match_table = sh_mmcif_of_match, }, diff --git a/drivers/mmc/host/sunxi-mmc.c b/drivers/mmc/host/sunxi-mmc.c index 5e95bbc51644..fc62773602ec 100644 --- a/drivers/mmc/host/sunxi-mmc.c +++ b/drivers/mmc/host/sunxi-mmc.c @@ -1514,6 +1514,7 @@ static const struct dev_pm_ops sunxi_mmc_pm_ops = { static struct platform_driver sunxi_mmc_driver = { .driver = { .name = "sunxi-mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sunxi_mmc_of_match), .pm = &sunxi_mmc_pm_ops, }, diff --git a/drivers/mmc/host/tmio_mmc.c b/drivers/mmc/host/tmio_mmc.c index 757c90160ae4..d2d3b8df1bbe 100644 --- a/drivers/mmc/host/tmio_mmc.c +++ b/drivers/mmc/host/tmio_mmc.c @@ -213,6 +213,7 @@ static const struct dev_pm_ops tmio_mmc_dev_pm_ops = { static struct platform_driver tmio_mmc_driver = { .driver = { .name = "tmio-mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &tmio_mmc_dev_pm_ops, }, .probe = tmio_mmc_probe, diff --git a/drivers/mmc/host/usdhi6rol0.c b/drivers/mmc/host/usdhi6rol0.c index 7666c90054ae..e2d5112d809d 100644 --- a/drivers/mmc/host/usdhi6rol0.c +++ b/drivers/mmc/host/usdhi6rol0.c @@ -1890,6 +1890,7 @@ static struct platform_driver usdhi6_driver = { .remove = usdhi6_remove, .driver = { .name = "usdhi6rol0", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = usdhi6_of_match, }, }; diff --git a/drivers/mmc/host/wbsd.c b/drivers/mmc/host/wbsd.c index 67f917d6ecd3..cd63ea865b77 100644 --- a/drivers/mmc/host/wbsd.c +++ b/drivers/mmc/host/wbsd.c @@ -1905,6 +1905,7 @@ static struct platform_driver wbsd_driver = { .resume = wbsd_platform_resume, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, }; diff --git a/drivers/mmc/host/wmt-sdmmc.c b/drivers/mmc/host/wmt-sdmmc.c index 2c4ba1fa4bbf..cf10949fb0ac 100644 --- a/drivers/mmc/host/wmt-sdmmc.c +++ b/drivers/mmc/host/wmt-sdmmc.c @@ -990,6 +990,7 @@ static struct platform_driver wmt_mci_driver = { .remove = wmt_mci_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = wmt_mci_pm_ops, .of_match_table = wmt_mci_dt_ids, }, From 2a99f3fa85eaa79c332945a256ff63aad1fd0b9c Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 3 Sep 2020 16:24:37 -0700 Subject: [PATCH 069/271] mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in v4.9 This is like commit 3d3451124f3d ("mmc: sdhci-msm: Prefer asynchronous probe") but applied to a whole pile of drivers. This batch converts the drivers that appeared to be around in the v4.9 timeframe. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200903162412.2.I226782b43191ce367fa3bc1c907c29f571890412@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-brcmstb.c | 1 + drivers/mmc/host/sdhci-pic32.c | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/mmc/host/sdhci-brcmstb.c b/drivers/mmc/host/sdhci-brcmstb.c index 876668827580..bbf3496f4495 100644 --- a/drivers/mmc/host/sdhci-brcmstb.c +++ b/drivers/mmc/host/sdhci-brcmstb.c @@ -326,6 +326,7 @@ MODULE_DEVICE_TABLE(of, sdhci_brcm_of_match); static struct platform_driver sdhci_brcmstb_driver = { .driver = { .name = "sdhci-brcmstb", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = of_match_ptr(sdhci_brcm_of_match), }, diff --git a/drivers/mmc/host/sdhci-pic32.c b/drivers/mmc/host/sdhci-pic32.c index a11e6397d4ff..6ce1519ae177 100644 --- a/drivers/mmc/host/sdhci-pic32.c +++ b/drivers/mmc/host/sdhci-pic32.c @@ -241,6 +241,7 @@ MODULE_DEVICE_TABLE(of, pic32_sdhci_id_table); static struct platform_driver pic32_sdhci_driver = { .driver = { .name = "pic32-sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(pic32_sdhci_id_table), }, .probe = pic32_sdhci_probe, From 7320915c88618050a2f65a193f0e25cccb2007b6 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 3 Sep 2020 16:24:38 -0700 Subject: [PATCH 070/271] mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in v4.14 This is like commit 3d3451124f3d ("mmc: sdhci-msm: Prefer asynchronous probe") but applied to a whole pile of drivers. This batch converts the drivers that appeared to be around in the v4.14 timeframe. Signed-off-by: Douglas Anderson Reviewed-by: Wolfram Sang Tested-by: Wolfram Sang # SDHI drivers Link: https://lore.kernel.org/r/20200903162412.3.Id1ff21470f08f427aedd0a6535dcd83ccc56b278@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/bcm2835.c | 1 + drivers/mmc/host/cavium-octeon.c | 1 + drivers/mmc/host/dw_mmc-zx.c | 1 + drivers/mmc/host/meson-gx-mmc.c | 1 + drivers/mmc/host/renesas_sdhi_internal_dmac.c | 1 + drivers/mmc/host/renesas_sdhi_sys_dmac.c | 1 + drivers/mmc/host/sdhci-cadence.c | 1 + drivers/mmc/host/sdhci-xenon.c | 1 + 8 files changed, 8 insertions(+) diff --git a/drivers/mmc/host/bcm2835.c b/drivers/mmc/host/bcm2835.c index 35320bc9dc02..8c2361e66277 100644 --- a/drivers/mmc/host/bcm2835.c +++ b/drivers/mmc/host/bcm2835.c @@ -1474,6 +1474,7 @@ static struct platform_driver bcm2835_driver = { .remove = bcm2835_remove, .driver = { .name = "sdhost-bcm2835", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = bcm2835_match, }, }; diff --git a/drivers/mmc/host/cavium-octeon.c b/drivers/mmc/host/cavium-octeon.c index e299cdd1e619..2c4b2df52adb 100644 --- a/drivers/mmc/host/cavium-octeon.c +++ b/drivers/mmc/host/cavium-octeon.c @@ -327,6 +327,7 @@ static struct platform_driver octeon_mmc_driver = { .remove = octeon_mmc_remove, .driver = { .name = KBUILD_MODNAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = octeon_mmc_match, }, }; diff --git a/drivers/mmc/host/dw_mmc-zx.c b/drivers/mmc/host/dw_mmc-zx.c index d9e483432a61..51bcc6332f3a 100644 --- a/drivers/mmc/host/dw_mmc-zx.c +++ b/drivers/mmc/host/dw_mmc-zx.c @@ -222,6 +222,7 @@ static struct platform_driver dw_mci_zx_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dwmmc_zx", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_zx_match, .pm = &dw_mci_zx_dev_pm_ops, }, diff --git a/drivers/mmc/host/meson-gx-mmc.c b/drivers/mmc/host/meson-gx-mmc.c index a68b43082f61..4ec41579940a 100644 --- a/drivers/mmc/host/meson-gx-mmc.c +++ b/drivers/mmc/host/meson-gx-mmc.c @@ -1264,6 +1264,7 @@ static struct platform_driver meson_mmc_driver = { .remove = meson_mmc_remove, .driver = { .name = DRIVER_NAME, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(meson_mmc_of_match), }, }; diff --git a/drivers/mmc/host/renesas_sdhi_internal_dmac.c b/drivers/mmc/host/renesas_sdhi_internal_dmac.c index 8ee6b2b85ec2..fe13e1ea22dc 100644 --- a/drivers/mmc/host/renesas_sdhi_internal_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_internal_dmac.c @@ -353,6 +353,7 @@ static const struct dev_pm_ops renesas_sdhi_internal_dmac_dev_pm_ops = { static struct platform_driver renesas_internal_dmac_sdhi_driver = { .driver = { .name = "renesas_sdhi_internal_dmac", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &renesas_sdhi_internal_dmac_dev_pm_ops, .of_match_table = renesas_sdhi_internal_dmac_of_match, }, diff --git a/drivers/mmc/host/renesas_sdhi_sys_dmac.c b/drivers/mmc/host/renesas_sdhi_sys_dmac.c index 13ff023fbee9..c5f789675302 100644 --- a/drivers/mmc/host/renesas_sdhi_sys_dmac.c +++ b/drivers/mmc/host/renesas_sdhi_sys_dmac.c @@ -463,6 +463,7 @@ static const struct dev_pm_ops renesas_sdhi_sys_dmac_dev_pm_ops = { static struct platform_driver renesas_sys_dmac_sdhi_driver = { .driver = { .name = "sh_mobile_sdhi", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &renesas_sdhi_sys_dmac_dev_pm_ops, .of_match_table = renesas_sdhi_sys_dmac_of_match, }, diff --git a/drivers/mmc/host/sdhci-cadence.c b/drivers/mmc/host/sdhci-cadence.c index 4d9f7681817c..6f2de54a5987 100644 --- a/drivers/mmc/host/sdhci-cadence.c +++ b/drivers/mmc/host/sdhci-cadence.c @@ -463,6 +463,7 @@ MODULE_DEVICE_TABLE(of, sdhci_cdns_match); static struct platform_driver sdhci_cdns_driver = { .driver = { .name = "sdhci-cdns", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_cdns_pm_ops, .of_match_table = sdhci_cdns_match, }, diff --git a/drivers/mmc/host/sdhci-xenon.c b/drivers/mmc/host/sdhci-xenon.c index 4703cd540c7f..24c978de2a3f 100644 --- a/drivers/mmc/host/sdhci-xenon.c +++ b/drivers/mmc/host/sdhci-xenon.c @@ -677,6 +677,7 @@ MODULE_DEVICE_TABLE(of, sdhci_xenon_dt_ids); static struct platform_driver sdhci_xenon_driver = { .driver = { .name = "xenon-sdhci", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_xenon_dt_ids, .pm = &sdhci_xenon_dev_pm_ops, }, From a1a489197a07ec1dbf3f6033b84755a74efc65dc Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 3 Sep 2020 16:24:39 -0700 Subject: [PATCH 071/271] mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in v4.19 This is like commit 3d3451124f3d ("mmc: sdhci-msm: Prefer asynchronous probe") but applied to a whole pile of drivers. This batch converts the drivers that appeared to be around in the v4.19 timeframe. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200903162412.4.I84eb3e0a738635d524c90d1a688087bc295f7c32@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/dw_mmc-bluefield.c | 1 + drivers/mmc/host/dw_mmc-hi3798cv200.c | 1 + drivers/mmc/host/meson-mx-sdio.c | 1 + drivers/mmc/host/sdhci-of-dwcmshc.c | 1 + drivers/mmc/host/sdhci-omap.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/mmc/host/dw_mmc-bluefield.c b/drivers/mmc/host/dw_mmc-bluefield.c index aa38b1a8017e..10baf122bc15 100644 --- a/drivers/mmc/host/dw_mmc-bluefield.c +++ b/drivers/mmc/host/dw_mmc-bluefield.c @@ -55,6 +55,7 @@ static struct platform_driver dw_mci_bluefield_pltfm_driver = { .remove = dw_mci_pltfm_remove, .driver = { .name = "dwmmc_bluefield", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_bluefield_match, .pm = &dw_mci_pltfm_pmops, }, diff --git a/drivers/mmc/host/dw_mmc-hi3798cv200.c b/drivers/mmc/host/dw_mmc-hi3798cv200.c index 83e1bad0a008..39794f93826f 100644 --- a/drivers/mmc/host/dw_mmc-hi3798cv200.c +++ b/drivers/mmc/host/dw_mmc-hi3798cv200.c @@ -200,6 +200,7 @@ static struct platform_driver dw_mci_hi3798cv200_driver = { .remove = dw_mci_hi3798cv200_remove, .driver = { .name = "dwmmc_hi3798cv200", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = dw_mci_hi3798cv200_match, }, }; diff --git a/drivers/mmc/host/meson-mx-sdio.c b/drivers/mmc/host/meson-mx-sdio.c index 703d5834f9a5..1c5299cd0cbe 100644 --- a/drivers/mmc/host/meson-mx-sdio.c +++ b/drivers/mmc/host/meson-mx-sdio.c @@ -755,6 +755,7 @@ static struct platform_driver meson_mx_mmc_driver = { .remove = meson_mx_mmc_remove, .driver = { .name = "meson-mx-sdio", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(meson_mx_mmc_of_match), }, }; diff --git a/drivers/mmc/host/sdhci-of-dwcmshc.c b/drivers/mmc/host/sdhci-of-dwcmshc.c index 64ac0dbee95c..4b673792b5a4 100644 --- a/drivers/mmc/host/sdhci-of-dwcmshc.c +++ b/drivers/mmc/host/sdhci-of-dwcmshc.c @@ -214,6 +214,7 @@ MODULE_DEVICE_TABLE(of, sdhci_dwcmshc_dt_ids); static struct platform_driver sdhci_dwcmshc_driver = { .driver = { .name = "sdhci-dwcmshc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_dwcmshc_dt_ids, .pm = &dwcmshc_pmops, }, diff --git a/drivers/mmc/host/sdhci-omap.c b/drivers/mmc/host/sdhci-omap.c index 1ec74c2d5c17..7893fd3599b6 100644 --- a/drivers/mmc/host/sdhci-omap.c +++ b/drivers/mmc/host/sdhci-omap.c @@ -1297,6 +1297,7 @@ static struct platform_driver sdhci_omap_driver = { .remove = sdhci_omap_remove, .driver = { .name = "sdhci-omap", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_omap_dev_pm_ops, .of_match_table = omap_sdhci_match, }, From d86472ae8b20805449c42467417cfc1ff579a76b Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 3 Sep 2020 16:24:40 -0700 Subject: [PATCH 072/271] mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that existed in v5.4 This is like commit 3d3451124f3d ("mmc: sdhci-msm: Prefer asynchronous probe") but applied to a whole pile of drivers. This batch converts the drivers that appeared to be around in the v5.4 timeframe. Signed-off-by: Douglas Anderson Link: https://lore.kernel.org/r/20200903162412.5.I2b630c4d40ff4ea61d5b30b8ccfe95890e257100@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/alcor.c | 1 + drivers/mmc/host/sdhci-of-aspeed.c | 2 ++ drivers/mmc/host/sdhci-sprd.c | 1 + drivers/mmc/host/sdhci_am654.c | 1 + drivers/mmc/host/uniphier-sd.c | 1 + 5 files changed, 6 insertions(+) diff --git a/drivers/mmc/host/alcor.c b/drivers/mmc/host/alcor.c index 026ca9194ce5..bfb8efeb7eb8 100644 --- a/drivers/mmc/host/alcor.c +++ b/drivers/mmc/host/alcor.c @@ -1178,6 +1178,7 @@ static struct platform_driver alcor_pci_sdmmc_driver = { .id_table = alcor_pci_sdmmc_ids, .driver = { .name = DRV_NAME_ALCOR_PCI_SDMMC, + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &alcor_mmc_pm_ops }, }; diff --git a/drivers/mmc/host/sdhci-of-aspeed.c b/drivers/mmc/host/sdhci-of-aspeed.c index a1bcc0f4ba9e..4f008ba3280e 100644 --- a/drivers/mmc/host/sdhci-of-aspeed.c +++ b/drivers/mmc/host/sdhci-of-aspeed.c @@ -240,6 +240,7 @@ static const struct of_device_id aspeed_sdhci_of_match[] = { static struct platform_driver aspeed_sdhci_driver = { .driver = { .name = "sdhci-aspeed", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = aspeed_sdhci_of_match, }, .probe = aspeed_sdhci_probe, @@ -318,6 +319,7 @@ MODULE_DEVICE_TABLE(of, aspeed_sdc_of_match); static struct platform_driver aspeed_sdc_driver = { .driver = { .name = "sd-controller-aspeed", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .pm = &sdhci_pltfm_pmops, .of_match_table = aspeed_sdc_of_match, }, diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c index bafa2e41c8b6..1efaf602c206 100644 --- a/drivers/mmc/host/sdhci-sprd.c +++ b/drivers/mmc/host/sdhci-sprd.c @@ -787,6 +787,7 @@ static struct platform_driver sdhci_sprd_driver = { .remove = sdhci_sprd_remove, .driver = { .name = "sdhci_sprd_r11", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(sdhci_sprd_of_match), .pm = &sdhci_sprd_pm_ops, }, diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index c5f47197d177..a4c6d9d80e88 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -745,6 +745,7 @@ static int sdhci_am654_remove(struct platform_device *pdev) static struct platform_driver sdhci_am654_driver = { .driver = { .name = "sdhci-am654", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_am654_of_match, }, .probe = sdhci_am654_probe, diff --git a/drivers/mmc/host/uniphier-sd.c b/drivers/mmc/host/uniphier-sd.c index 55efd5c53249..3092466a99ab 100644 --- a/drivers/mmc/host/uniphier-sd.c +++ b/drivers/mmc/host/uniphier-sd.c @@ -685,6 +685,7 @@ static struct platform_driver uniphier_sd_driver = { .remove = uniphier_sd_remove, .driver = { .name = "uniphier-sd", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = uniphier_sd_match, }, }; From 31ae403513be3aa464c28a5544ecb2beb20f32c6 Mon Sep 17 00:00:00 2001 From: Douglas Anderson Date: Thu, 3 Sep 2020 16:24:41 -0700 Subject: [PATCH 073/271] mmc: Set PROBE_PREFER_ASYNCHRONOUS for drivers that are newer than 5.4 This is like commit 3d3451124f3d ("mmc: sdhci-msm: Prefer asynchronous probe") but applied to a whole pile of drivers. This batch converts the drivers that appeared to have been added after kernel 5.4. Signed-off-by: Douglas Anderson Acked-by: Lars Povlsen Acked-by: Angelo Dureghello Acked-by: Manivannan Sadhasivam Link: https://lore.kernel.org/r/20200903162412.6.Ib121debfb18e5f923a3cd38fe9c36aa086c650c5@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/meson-mx-sdhc-mmc.c | 1 + drivers/mmc/host/owl-mmc.c | 1 + drivers/mmc/host/sdhci-esdhc-mcf.c | 1 + drivers/mmc/host/sdhci-milbeaut.c | 1 + drivers/mmc/host/sdhci-of-sparx5.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/mmc/host/meson-mx-sdhc-mmc.c b/drivers/mmc/host/meson-mx-sdhc-mmc.c index 53e3f6a4245a..7cd9c0ec2fcf 100644 --- a/drivers/mmc/host/meson-mx-sdhc-mmc.c +++ b/drivers/mmc/host/meson-mx-sdhc-mmc.c @@ -903,6 +903,7 @@ static struct platform_driver meson_mx_sdhc_driver = { .remove = meson_mx_sdhc_remove, .driver = { .name = "meson-mx-sdhc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(meson_mx_sdhc_of_match), }, }; diff --git a/drivers/mmc/host/owl-mmc.c b/drivers/mmc/host/owl-mmc.c index df43f42855e2..ccf214a89eda 100644 --- a/drivers/mmc/host/owl-mmc.c +++ b/drivers/mmc/host/owl-mmc.c @@ -689,6 +689,7 @@ MODULE_DEVICE_TABLE(of, owl_mmc_of_match); static struct platform_driver owl_mmc_driver = { .driver = { .name = "owl_mmc", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = owl_mmc_of_match, }, .probe = owl_mmc_probe, diff --git a/drivers/mmc/host/sdhci-esdhc-mcf.c b/drivers/mmc/host/sdhci-esdhc-mcf.c index 71bf086a9812..ca7a1690b2a8 100644 --- a/drivers/mmc/host/sdhci-esdhc-mcf.c +++ b/drivers/mmc/host/sdhci-esdhc-mcf.c @@ -509,6 +509,7 @@ static int sdhci_esdhc_mcf_remove(struct platform_device *pdev) static struct platform_driver sdhci_esdhc_mcf_driver = { .driver = { .name = "sdhci-esdhc-mcf", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, }, .probe = sdhci_esdhc_mcf_probe, .remove = sdhci_esdhc_mcf_remove, diff --git a/drivers/mmc/host/sdhci-milbeaut.c b/drivers/mmc/host/sdhci-milbeaut.c index 4e7cc0680f94..148b37ac6564 100644 --- a/drivers/mmc/host/sdhci-milbeaut.c +++ b/drivers/mmc/host/sdhci-milbeaut.c @@ -333,6 +333,7 @@ static int sdhci_milbeaut_remove(struct platform_device *pdev) static struct platform_driver sdhci_milbeaut_driver = { .driver = { .name = "sdhci-milbeaut", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = of_match_ptr(mlb_dt_ids), }, .probe = sdhci_milbeaut_probe, diff --git a/drivers/mmc/host/sdhci-of-sparx5.c b/drivers/mmc/host/sdhci-of-sparx5.c index 747f108a0ace..28e4ee69e100 100644 --- a/drivers/mmc/host/sdhci-of-sparx5.c +++ b/drivers/mmc/host/sdhci-of-sparx5.c @@ -255,6 +255,7 @@ MODULE_DEVICE_TABLE(of, sdhci_sparx5_of_match); static struct platform_driver sdhci_sparx5_driver = { .driver = { .name = "sdhci-sparx5", + .probe_type = PROBE_PREFER_ASYNCHRONOUS, .of_match_table = sdhci_sparx5_of_match, .pm = &sdhci_pltfm_pmops, }, From 461aea72af0e3aefff0c428f73c75e6b9ce9fcb5 Mon Sep 17 00:00:00 2001 From: Ulf Hansson Date: Tue, 8 Sep 2020 16:02:42 +0200 Subject: [PATCH 074/271] mmc: Drop COMPILE_TEST Kconfig option for MMC_S3C MMC_S3C isn't ready yet to be built with COMPILE_TEST, hence drop it. Signed-off-by: Ulf Hansson Link: https://lore.kernel.org/r/20200908140242.743234-1-ulf.hansson@linaro.org Reviewed-by: Krzysztof Kozlowski --- drivers/mmc/host/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index dc646359b4ff..185a2564e623 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -630,7 +630,7 @@ config MMC_SPI config MMC_S3C tristate "Samsung S3C SD/MMC Card Interface support" - depends on ARCH_S3C24XX || COMPILE_TEST + depends on ARCH_S3C24XX depends on S3C24XX_DMAC help This selects a driver for the MCI interface found in From 354f47b699ae209606dcd0b05200cd7f17059e38 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 2 Sep 2020 10:18:11 +0200 Subject: [PATCH 075/271] mmc: tmio: add generic hook to fixup after a completed request Sadly, due to HW bugs, we need a callback to work around issues just before completing the request. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20200902081812.1591-2-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/tmio_mmc.h | 1 + drivers/mmc/host/tmio_mmc_core.c | 3 +++ 2 files changed, 4 insertions(+) diff --git a/drivers/mmc/host/tmio_mmc.h b/drivers/mmc/host/tmio_mmc.h index 51b5f388f6d8..9546e542619c 100644 --- a/drivers/mmc/host/tmio_mmc.h +++ b/drivers/mmc/host/tmio_mmc.h @@ -179,6 +179,7 @@ struct tmio_mmc_host { int (*write16_hook)(struct tmio_mmc_host *host, int addr); void (*reset)(struct tmio_mmc_host *host); bool (*check_retune)(struct tmio_mmc_host *host); + void (*fixup_request)(struct tmio_mmc_host *host, struct mmc_request *mrq); void (*prepare_hs400_tuning)(struct tmio_mmc_host *host); void (*hs400_downgrade)(struct tmio_mmc_host *host); diff --git a/drivers/mmc/host/tmio_mmc_core.c b/drivers/mmc/host/tmio_mmc_core.c index 0f266cbf82b8..2fce0518632d 100644 --- a/drivers/mmc/host/tmio_mmc_core.c +++ b/drivers/mmc/host/tmio_mmc_core.c @@ -809,6 +809,9 @@ static void tmio_mmc_finish_request(struct tmio_mmc_host *host) return; } + if (host->fixup_request) + host->fixup_request(host, mrq); + mmc_request_done(host->mmc, mrq); } From ce6f92c2801e90650106b00666aa37560978e324 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Wed, 2 Sep 2020 10:18:12 +0200 Subject: [PATCH 076/271] mmc: renesas_sdhi: support manual calibration Some R-Car Gen3 SoCs need some manual correction of timing parameters after the automatic tuning has finished but before next CMD13 is completed. This patch implements that by this state machine: - introducing a per-SoC correction table if needed - iff such a table exists, the 'fixup_request' callback is populated during probe - iff such a table exists, a runtime flag ('needs_adjust_hs400') is set when HS400 tuning was completed - the callback will check the runtime flag and enable the corrected manual mode if the flag is set and CMD13 is encountered - at the end of the enablement the runtime flag is cleared - iff the configuration flag is set, the manual mode will be disabled when HS400 gets downgraded There also some helper functions added to access the TMPPORT registers. The actual correction table is SoC and instance(!) specific and is added to the quirks struct. Signed-off-by: Takeshi Saito Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20200902081812.1591-3-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi.h | 5 + drivers/mmc/host/renesas_sdhi_core.c | 152 ++++++++++++++++++++++++++- 2 files changed, 155 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi.h b/drivers/mmc/host/renesas_sdhi.h index 14c64caefc64..24958de274c1 100644 --- a/drivers/mmc/host/renesas_sdhi.h +++ b/drivers/mmc/host/renesas_sdhi.h @@ -33,10 +33,13 @@ struct renesas_sdhi_of_data { unsigned short max_segs; }; +#define SDHI_CALIB_TABLE_MAX 32 + struct renesas_sdhi_quirks { bool hs400_disabled; bool hs400_4taps; u32 hs400_bad_taps; + const u8 (*hs400_calib_table)[SDHI_CALIB_TABLE_MAX]; }; struct tmio_mmc_dma { @@ -58,6 +61,8 @@ struct renesas_sdhi { void __iomem *scc_ctl; u32 scc_tappos; u32 scc_tappos_hs400; + const u8 *adjust_hs400_calib_table; + bool needs_adjust_hs400; bool doing_tune; /* Tuning values: 1 for success, 0 for failure */ diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 42eaec40a6d9..d621a4af8e87 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -47,6 +48,8 @@ #define SDHI_VER_GEN3_SD 0xcc10 #define SDHI_VER_GEN3_SDMMC 0xcd10 +#define SDHI_GEN3_MMC0_ADDR 0xee140000 + static void renesas_sdhi_sdbuf_width(struct tmio_mmc_host *host, int width) { u32 val; @@ -251,6 +254,11 @@ static int renesas_sdhi_start_signal_voltage_switch(struct mmc_host *mmc, #define SH_MOBILE_SDHI_SCC_RVSREQ 0x00A #define SH_MOBILE_SDHI_SCC_SMPCMP 0x00C #define SH_MOBILE_SDHI_SCC_TMPPORT2 0x00E +#define SH_MOBILE_SDHI_SCC_TMPPORT3 0x014 +#define SH_MOBILE_SDHI_SCC_TMPPORT4 0x016 +#define SH_MOBILE_SDHI_SCC_TMPPORT5 0x018 +#define SH_MOBILE_SDHI_SCC_TMPPORT6 0x01A +#define SH_MOBILE_SDHI_SCC_TMPPORT7 0x01C #define SH_MOBILE_SDHI_SCC_DTCNTL_TAPEN BIT(0) #define SH_MOBILE_SDHI_SCC_DTCNTL_TAPNUM_SHIFT 16 @@ -271,6 +279,40 @@ static int renesas_sdhi_start_signal_voltage_switch(struct mmc_host *mmc, #define SH_MOBILE_SDHI_SCC_TMPPORT2_HS400OSEL BIT(4) #define SH_MOBILE_SDHI_SCC_TMPPORT2_HS400EN BIT(31) +/* Definitions for values the SH_MOBILE_SDHI_SCC_TMPPORT4 register */ +#define SH_MOBILE_SDHI_SCC_TMPPORT4_DLL_ACC_START BIT(0) + +/* Definitions for values the SH_MOBILE_SDHI_SCC_TMPPORT5 register */ +#define SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_R BIT(8) +#define SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_W (0 << 8) +#define SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_ADR_MASK 0x3F + +/* Definitions for values the SH_MOBILE_SDHI_SCC register */ +#define SH_MOBILE_SDHI_SCC_TMPPORT_DISABLE_WP_CODE 0xa5000000 +#define SH_MOBILE_SDHI_SCC_TMPPORT_CALIB_CODE_MASK 0x1f +#define SH_MOBILE_SDHI_SCC_TMPPORT_MANUAL_MODE BIT(7) + +static const u8 r8a7796_es13_calib_table[2][SDHI_CALIB_TABLE_MAX] = { + { 3, 3, 3, 3, 3, 3, 3, 4, 4, 5, 6, 7, 8, 9, 10, 15, + 16, 16, 16, 16, 16, 16, 17, 18, 18, 19, 20, 21, 22, 23, 24, 25 }, + { 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 6, 7, 8, 11, + 12, 17, 18, 18, 18, 18, 18, 18, 18, 19, 20, 21, 22, 23, 25, 25 } +}; + +static const u8 r8a77965_calib_table[2][SDHI_CALIB_TABLE_MAX] = { + { 1, 2, 6, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 15, 15, 16, + 17, 18, 19, 20, 21, 22, 23, 24, 25, 25, 26, 27, 28, 29, 30, 31 }, + { 2, 3, 4, 4, 5, 6, 7, 9, 10, 11, 12, 13, 14, 15, 16, 17, + 17, 17, 20, 21, 22, 23, 24, 25, 27, 28, 29, 30, 31, 31, 31, 31 } +}; + +static const u8 r8a77990_calib_table[2][SDHI_CALIB_TABLE_MAX] = { + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 1, 2, 3, 3, 4, 4, 4, 5, 5, 6, 8, 9, 10, + 11, 12, 13, 15, 16, 17, 17, 18, 18, 19, 20, 22, 24, 25, 26, 26 } +}; + static inline u32 sd_scc_read32(struct tmio_mmc_host *host, struct renesas_sdhi *priv, int addr) { @@ -377,6 +419,9 @@ static void renesas_sdhi_hs400_complete(struct mmc_host *mmc) sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); + + if (priv->adjust_hs400_calib_table) + priv->needs_adjust_hs400 = true; } static void renesas_sdhi_reset_scc(struct tmio_mmc_host *host, @@ -407,6 +452,74 @@ static void renesas_sdhi_disable_scc(struct mmc_host *mmc) sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); } +static u32 sd_scc_tmpport_read32(struct tmio_mmc_host *host, + struct renesas_sdhi *priv, u32 addr) +{ + /* read mode */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT5, + SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_R | + (SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_ADR_MASK & addr)); + + /* access start and stop */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, + SH_MOBILE_SDHI_SCC_TMPPORT4_DLL_ACC_START); + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, 0); + + return sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT7); +} + +static void sd_scc_tmpport_write32(struct tmio_mmc_host *host, + struct renesas_sdhi *priv, u32 addr, u32 val) +{ + /* write mode */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT5, + SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_RW_SEL_W | + (SH_MOBILE_SDHI_SCC_TMPPORT5_DLL_ADR_MASK & addr)); + + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT6, val); + + /* access start and stop */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, + SH_MOBILE_SDHI_SCC_TMPPORT4_DLL_ACC_START); + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT4, 0); +} + +static void renesas_sdhi_adjust_hs400_mode_enable(struct tmio_mmc_host *host) +{ + struct renesas_sdhi *priv = host_to_priv(host); + u32 calib_code; + + /* disable write protect */ + sd_scc_tmpport_write32(host, priv, 0x00, + SH_MOBILE_SDHI_SCC_TMPPORT_DISABLE_WP_CODE); + /* read calibration code and adjust */ + calib_code = sd_scc_tmpport_read32(host, priv, 0x26); + calib_code &= SH_MOBILE_SDHI_SCC_TMPPORT_CALIB_CODE_MASK; + + sd_scc_tmpport_write32(host, priv, 0x22, + SH_MOBILE_SDHI_SCC_TMPPORT_MANUAL_MODE | + priv->adjust_hs400_calib_table[calib_code]); + + /* set offset value to TMPPORT3, hardcoded to OFFSET0 (= 0x3) for now */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT3, 0x3); + + /* adjustment done, clear flag */ + priv->needs_adjust_hs400 = false; +} + +static void renesas_sdhi_adjust_hs400_mode_disable(struct tmio_mmc_host *host) +{ + struct renesas_sdhi *priv = host_to_priv(host); + + /* disable write protect */ + sd_scc_tmpport_write32(host, priv, 0x00, + SH_MOBILE_SDHI_SCC_TMPPORT_DISABLE_WP_CODE); + /* disable manual calibration */ + sd_scc_tmpport_write32(host, priv, 0x22, 0); + /* clear offset value of TMPPORT3 */ + sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT3, 0); +} + static void renesas_sdhi_reset_hs400_mode(struct tmio_mmc_host *host, struct renesas_sdhi *priv) { @@ -424,6 +537,9 @@ static void renesas_sdhi_reset_hs400_mode(struct tmio_mmc_host *host, SH_MOBILE_SDHI_SCC_TMPPORT2_HS400OSEL) & sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_TMPPORT2)); + if (priv->adjust_hs400_calib_table) + renesas_sdhi_adjust_hs400_mode_disable(host); + sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); } @@ -442,6 +558,7 @@ static void renesas_sdhi_reset(struct tmio_mmc_host *host) renesas_sdhi_reset_scc(host, priv); renesas_sdhi_reset_hs400_mode(host, priv); + priv->needs_adjust_hs400 = false; sd_ctrl_write16(host, CTL_SD_CARD_CLK_CTL, CLK_CTL_SCLKEN | sd_ctrl_read16(host, CTL_SD_CARD_CLK_CTL)); @@ -717,6 +834,13 @@ static int renesas_sdhi_multi_io_quirk(struct mmc_card *card, return blk_size; } +static void renesas_sdhi_fixup_request(struct tmio_mmc_host *host, struct mmc_request *mrq) +{ + struct renesas_sdhi *priv = host_to_priv(host); + + if (priv->needs_adjust_hs400 && mrq->cmd->opcode == MMC_SEND_STATUS) + renesas_sdhi_adjust_hs400_mode_enable(host); +} static void renesas_sdhi_enable_dma(struct tmio_mmc_host *host, bool enable) { /* Iff regs are 8 byte apart, sdbuf is 64 bit. Otherwise always 32. */ @@ -748,6 +872,21 @@ static const struct renesas_sdhi_quirks sdhi_quirks_bad_taps2367 = { .hs400_bad_taps = BIT(2) | BIT(3) | BIT(6) | BIT(7), }; +static const struct renesas_sdhi_quirks sdhi_quirks_r8a7796_es13 = { + .hs400_4taps = true, + .hs400_bad_taps = BIT(2) | BIT(3) | BIT(6) | BIT(7), + .hs400_calib_table = r8a7796_es13_calib_table, +}; + +static const struct renesas_sdhi_quirks sdhi_quirks_r8a77965 = { + .hs400_bad_taps = BIT(2) | BIT(3) | BIT(6) | BIT(7), + .hs400_calib_table = r8a77965_calib_table, +}; + +static const struct renesas_sdhi_quirks sdhi_quirks_r8a77990 = { + .hs400_calib_table = r8a77990_calib_table, +}; + /* * Note for r8a7796 / r8a774a1: we can't distinguish ES1.1 and 1.2 as of now. * So, we want to treat them equally and only have a match for ES1.2 to enforce @@ -759,10 +898,11 @@ static const struct soc_device_attribute sdhi_quirks_match[] = { { .soc_id = "r8a7795", .revision = "ES2.0", .data = &sdhi_quirks_4tap }, { .soc_id = "r8a7795", .revision = "ES3.*", .data = &sdhi_quirks_bad_taps2367 }, { .soc_id = "r8a7796", .revision = "ES1.[012]", .data = &sdhi_quirks_4tap_nohs400 }, - { .soc_id = "r8a7796", .revision = "ES1.*", .data = &sdhi_quirks_4tap }, + { .soc_id = "r8a7796", .revision = "ES1.*", .data = &sdhi_quirks_r8a7796_es13 }, { .soc_id = "r8a7796", .revision = "ES3.*", .data = &sdhi_quirks_bad_taps1357 }, - { .soc_id = "r8a77965", .data = &sdhi_quirks_bad_taps2367 }, + { .soc_id = "r8a77965", .data = &sdhi_quirks_r8a77965 }, { .soc_id = "r8a77980", .data = &sdhi_quirks_nohs400 }, + { .soc_id = "r8a77990", .data = &sdhi_quirks_r8a77990 }, { /* Sentinel. */ }, }; @@ -919,6 +1059,14 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (ver == SDHI_VER_GEN2_SDR50) mmc_data->flags &= ~TMIO_MMC_HAVE_CBSY; + if (ver == SDHI_VER_GEN3_SDMMC && quirks && quirks->hs400_calib_table) { + host->fixup_request = renesas_sdhi_fixup_request; + priv->adjust_hs400_calib_table = *( + res->start == SDHI_GEN3_MMC0_ADDR ? + quirks->hs400_calib_table : + quirks->hs400_calib_table + 1); + } + ret = tmio_mmc_host_probe(host); if (ret < 0) goto edisclk; From 91ca244bdcb6a0b6ad12c8325e5e9228b9472374 Mon Sep 17 00:00:00 2001 From: Viresh Kumar Date: Wed, 9 Sep 2020 19:41:53 +0530 Subject: [PATCH 077/271] mmc: sdhci-msm: Unconditionally call dev_pm_opp_of_remove_table() dev_pm_opp_of_remove_table() doesn't report any errors when it fails to find the OPP table with error -ENODEV (i.e. OPP table not present for the device). And we can call dev_pm_opp_of_remove_table() unconditionally here. Signed-off-by: Viresh Kumar Link: https://lore.kernel.org/r/890ae5601594fca5de104695a682f4b6efbc631b.1599660554.git.viresh.kumar@linaro.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-msm.c | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/drivers/mmc/host/sdhci-msm.c b/drivers/mmc/host/sdhci-msm.c index 23dacab1a399..3451eb325513 100644 --- a/drivers/mmc/host/sdhci-msm.c +++ b/drivers/mmc/host/sdhci-msm.c @@ -263,7 +263,6 @@ struct sdhci_msm_host { unsigned long clk_rate; struct mmc_host *mmc; struct opp_table *opp_table; - bool has_opp_table; bool use_14lpp_dll_reset; bool tuning_done; bool calibration_done; @@ -2302,11 +2301,9 @@ static int sdhci_msm_probe(struct platform_device *pdev) /* OPP table is optional */ ret = dev_pm_opp_of_add_table(&pdev->dev); - if (!ret) { - msm_host->has_opp_table = true; - } else if (ret != -ENODEV) { + if (ret && ret != -ENODEV) { dev_err(&pdev->dev, "Invalid OPP table in Device tree\n"); - goto opp_cleanup; + goto opp_put_clkname; } /* Vote for maximum clock rate for maximum performance */ @@ -2470,8 +2467,8 @@ static int sdhci_msm_probe(struct platform_device *pdev) clk_bulk_disable_unprepare(ARRAY_SIZE(msm_host->bulk_clks), msm_host->bulk_clks); opp_cleanup: - if (msm_host->has_opp_table) - dev_pm_opp_of_remove_table(&pdev->dev); + dev_pm_opp_of_remove_table(&pdev->dev); +opp_put_clkname: dev_pm_opp_put_clkname(msm_host->opp_table); bus_clk_disable: if (!IS_ERR(msm_host->bus_clk)) @@ -2491,8 +2488,7 @@ static int sdhci_msm_remove(struct platform_device *pdev) sdhci_remove_host(host, dead); - if (msm_host->has_opp_table) - dev_pm_opp_of_remove_table(&pdev->dev); + dev_pm_opp_of_remove_table(&pdev->dev); dev_pm_opp_put_clkname(msm_host->opp_table); pm_runtime_get_sync(&pdev->dev); pm_runtime_disable(&pdev->dev); From 878dbe426a564b164e6d25d21d4bc29a02e7fc2c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Mon, 14 Sep 2020 13:28:45 +0200 Subject: [PATCH 078/271] mmc: core: clear 'doing_init_tune' also after failures MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Reorganize the code, so that the flag is always cleared independently of a good or bad case. Fixes: 97a7d87e96b0 ("mmc: core: add a 'doing_init_tune' flag and a 'mmc_doing_tune' helper") Signed-off-by: Wolfram Sang Reviewed-by: Niklas Söderlund Link: https://lore.kernel.org/r/20200914112845.21855-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/mmc.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/core/mmc.c b/drivers/mmc/core/mmc.c index 67e95eba0e82..ff3063ce2acd 100644 --- a/drivers/mmc/core/mmc.c +++ b/drivers/mmc/core/mmc.c @@ -1766,14 +1766,14 @@ static int mmc_init_card(struct mmc_host *host, u32 ocr, host->doing_init_tune = 1; err = mmc_hs200_tuning(card); - if (err) - goto free_card; - - err = mmc_select_hs400(card); - if (err) - goto free_card; + if (!err) + err = mmc_select_hs400(card); host->doing_init_tune = 0; + + if (err) + goto free_card; + } else if (!mmc_card_hs400es(card)) { /* Select the desired bus width optionally */ err = mmc_select_bus_width(card); From d578b46db69d125a654f509bdc9091d84e924dc8 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Tue, 11 Aug 2020 15:00:20 +0800 Subject: [PATCH 079/271] erofs: avoid duplicated permission check for "trusted." xattrs Don't recheck it since xattr_permission() already checks CAP_SYS_ADMIN capability. Just follow 5d3ce4f70172 ("f2fs: avoid duplicated permission check for "trusted." xattrs") Reported-by: Hongyu Jin [ Gao Xiang: since it could cause some complex Android overlay permission issue as well on android-5.4+, it'd be better to backport to 5.4+ rather than pure cleanup on mainline. ] Cc: # 5.4+ Link: https://lore.kernel.org/r/20200811070020.6339-1-hsiangkao@redhat.com Reviewed-by: Chao Yu Signed-off-by: Gao Xiang --- fs/erofs/xattr.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index c8c381eadcd6..5bde77d70852 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -473,8 +473,6 @@ static int erofs_xattr_generic_get(const struct xattr_handler *handler, return -EOPNOTSUPP; break; case EROFS_XATTR_INDEX_TRUSTED: - if (!capable(CAP_SYS_ADMIN)) - return -EPERM; break; case EROFS_XATTR_INDEX_SECURITY: break; From e3f78d5e7e6b0825f4e646f74b0e469b023e5df4 Mon Sep 17 00:00:00 2001 From: Chao Yu Date: Thu, 17 Sep 2020 09:18:21 +0800 Subject: [PATCH 080/271] erofs: remove unneeded parameter After commit 0615090c5044 ("erofs: convert compressed files from readpages to readahead"), add_to_page_cache_lru() was moved to mm code, so that in below call path, no page will be cached into @pagepool list or grabbed from @pagepool list: - z_erofs_readpage - z_erofs_do_read_page - preload_compressed_pages - erofs_allocpage Let's get rid of this unneeded @pagepool parameter. Signed-off-by: Chao Yu Link: https://lore.kernel.org/r/20200917011821.22767-1-yuchao0@huawei.com Reviewed-by: Gao Xiang Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 6c939def00f9..ac6cb73df192 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -153,8 +153,7 @@ static DEFINE_MUTEX(z_pagemap_global_lock); static void preload_compressed_pages(struct z_erofs_collector *clt, struct address_space *mc, - enum z_erofs_cache_alloctype type, - struct list_head *pagepool) + enum z_erofs_cache_alloctype type) { const struct z_erofs_pcluster *pcl = clt->pcl; const unsigned int clusterpages = BIT(pcl->clusterbits); @@ -562,8 +561,7 @@ static bool should_alloc_managed_pages(struct z_erofs_decompress_frontend *fe, } static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, - struct page *page, - struct list_head *pagepool) + struct page *page) { struct inode *const inode = fe->inode; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); @@ -620,8 +618,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, else cache_strategy = DONTALLOC; - preload_compressed_pages(clt, MNGD_MAPPING(sbi), - cache_strategy, pagepool); + preload_compressed_pages(clt, MNGD_MAPPING(sbi), cache_strategy); hitted: /* @@ -653,7 +650,7 @@ static int z_erofs_do_read_page(struct z_erofs_decompress_frontend *fe, /* should allocate an additional staging page for pagevec */ if (err == -EAGAIN) { struct page *const newpage = - erofs_allocpage(pagepool, GFP_NOFS | __GFP_NOFAIL); + alloc_page(GFP_NOFS | __GFP_NOFAIL); newpage->mapping = Z_EROFS_MAPPING_STAGING; err = z_erofs_attach_page(clt, newpage, @@ -1282,7 +1279,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) f.headoffset = (erofs_off_t)page->index << PAGE_SHIFT; - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ @@ -1341,7 +1338,7 @@ static void z_erofs_readahead(struct readahead_control *rac) /* traversal in reverse order */ head = (void *)page_private(page); - err = z_erofs_do_read_page(&f, page, &pagepool); + err = z_erofs_do_read_page(&f, page); if (err) erofs_err(inode->i_sb, "readahead error at page %lu @ nid %llu", From 6c3e485ea37da2bcca68e6b8c635ddd2b4da540d Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 19 Sep 2020 15:27:28 +0800 Subject: [PATCH 081/271] erofs: avoid unnecessary variable `err' variable `err' in z_erofs_submit_queue() isn't useful here, remove it instead. Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20200919072730.24989-1-hsiangkao@redhat.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index ac6cb73df192..e43684b23fdd 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1190,7 +1190,6 @@ static void z_erofs_submit_queue(struct super_block *sb, do { struct page *page; - int err; page = pickup_page_for_submission(pcl, i++, pagepool, MNGD_MAPPING(sbi), @@ -1216,8 +1215,7 @@ static void z_erofs_submit_queue(struct super_block *sb, ++nr_bios; } - err = bio_add_page(bio, page, PAGE_SIZE, 0); - if (err < PAGE_SIZE) + if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) goto submit_bio_retry; last_index = cur; From bf9a123b9cf8d8c67c4c59757cbf4e4b2c4ab2e3 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 19 Sep 2020 15:27:29 +0800 Subject: [PATCH 082/271] erofs: fold in should_decompress_synchronously() should_decompress_synchronously() has one single condition for now, so fold it instead. Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20200919072730.24989-2-hsiangkao@redhat.com Signed-off-by: Gao Xiang --- fs/erofs/zdata.c | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index e43684b23fdd..bee6ce783c64 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -1294,24 +1294,18 @@ static int z_erofs_readpage(struct file *file, struct page *page) return err; } -static bool should_decompress_synchronously(struct erofs_sb_info *sbi, - unsigned int nr) -{ - return nr <= sbi->ctx.max_sync_decompress_pages; -} - static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; struct erofs_sb_info *const sbi = EROFS_I_SB(inode); - bool sync = should_decompress_synchronously(sbi, readahead_count(rac)); + unsigned int nr_pages = readahead_count(rac); + bool sync = (nr_pages <= sbi->ctx.max_sync_decompress_pages); struct z_erofs_decompress_frontend f = DECOMPRESS_FRONTEND_INIT(inode); struct page *page, *head = NULL; LIST_HEAD(pagepool); - trace_erofs_readpages(inode, readahead_index(rac), - readahead_count(rac), false); + trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); f.headoffset = readahead_pos(rac); From 6ea5aad32dd8b35af68b1ba955daa368900d3b98 Mon Sep 17 00:00:00 2001 From: Gao Xiang Date: Sat, 19 Sep 2020 15:27:30 +0800 Subject: [PATCH 083/271] erofs: add REQ_RAHEAD flag to readahead requests Let's add REQ_RAHEAD flag so it'd be easier to identify readahead I/O requests in blktrace. Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20200919072730.24989-3-hsiangkao@redhat.com Signed-off-by: Gao Xiang --- fs/erofs/data.c | 2 +- fs/erofs/zdata.c | 17 +++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 459ecb42cbd3..347be146884c 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -224,7 +224,7 @@ static inline struct bio *erofs_read_raw_page(struct bio *bio, bio_set_dev(bio, sb->s_bdev); bio->bi_iter.bi_sector = (sector_t)blknr << LOG_SECTORS_PER_BLOCK; - bio->bi_opf = REQ_OP_READ; + bio->bi_opf = REQ_OP_READ | (ra ? REQ_RAHEAD : 0); } err = bio_add_page(bio, page, PAGE_SIZE, 0); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index bee6ce783c64..50912a5420b4 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -135,6 +135,7 @@ struct z_erofs_decompress_frontend { struct z_erofs_collector clt; struct erofs_map_blocks map; + bool readahead; /* used for applying cache strategy on the fly */ bool backmost; erofs_off_t headoffset; @@ -1148,7 +1149,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, } static void z_erofs_submit_queue(struct super_block *sb, - z_erofs_next_pcluster_t owned_head, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, struct z_erofs_decompressqueue *fgq, bool *force_fg) @@ -1157,6 +1158,7 @@ static void z_erofs_submit_queue(struct super_block *sb, z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; void *bi_private; + z_erofs_next_pcluster_t owned_head = f->clt.owned_head; /* since bio will be NULL, no need to initialize last_index */ pgoff_t last_index; unsigned int nr_bios = 0; @@ -1212,6 +1214,8 @@ static void z_erofs_submit_queue(struct super_block *sb, LOG_SECTORS_PER_BLOCK; bio->bi_private = bi_private; bio->bi_opf = REQ_OP_READ; + if (f->readahead) + bio->bi_opf |= REQ_RAHEAD; ++nr_bios; } @@ -1243,14 +1247,14 @@ static void z_erofs_submit_queue(struct super_block *sb, } static void z_erofs_runqueue(struct super_block *sb, - struct z_erofs_collector *clt, + struct z_erofs_decompress_frontend *f, struct list_head *pagepool, bool force_fg) { struct z_erofs_decompressqueue io[NR_JOBQUEUES]; - if (clt->owned_head == Z_EROFS_PCLUSTER_TAIL) + if (f->clt.owned_head == Z_EROFS_PCLUSTER_TAIL) return; - z_erofs_submit_queue(sb, clt->owned_head, pagepool, io, &force_fg); + z_erofs_submit_queue(sb, f, pagepool, io, &force_fg); /* handle bypass queue (no i/o pclusters) immediately */ z_erofs_decompress_queue(&io[JQ_BYPASS], pagepool); @@ -1281,7 +1285,7 @@ static int z_erofs_readpage(struct file *file, struct page *page) (void)z_erofs_collector_end(&f.clt); /* if some compressed cluster ready, need submit them anyway */ - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, true); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, true); if (err) erofs_err(inode->i_sb, "failed to read, err [%d]", err); @@ -1307,6 +1311,7 @@ static void z_erofs_readahead(struct readahead_control *rac) trace_erofs_readpages(inode, readahead_index(rac), nr_pages, false); + f.readahead = true; f.headoffset = readahead_pos(rac); while ((page = readahead_page(rac))) { @@ -1340,7 +1345,7 @@ static void z_erofs_readahead(struct readahead_control *rac) (void)z_erofs_collector_end(&f.clt); - z_erofs_runqueue(inode->i_sb, &f.clt, &pagepool, sync); + z_erofs_runqueue(inode->i_sb, &f, &pagepool, sync); if (f.map.mpage) put_page(f.map.mpage); From 7931b2d56ea4f95ed54d578b483f7c0a28c84a70 Mon Sep 17 00:00:00 2001 From: Amit Singh Tomar Date: Fri, 11 Sep 2020 10:54:09 +0530 Subject: [PATCH 084/271] dt-bindings: mmc: owl: add compatible string actions,s700-mmc The commit adds a new SoC specific compatible string "actions,s700-mmc" in combination with more generic string "actions,owl-mmc". Placement order of these strings should abide by the principle of "from most specific to most general". Reviewed-by: Manivannan Sadhasivam Reviewed-by: Rob Herring Signed-off-by: Amit Singh Tomar Link: https://lore.kernel.org/r/1599801849-6071-1-git-send-email-amittomer25@gmail.com Signed-off-by: Ulf Hansson --- Documentation/devicetree/bindings/mmc/owl-mmc.yaml | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/Documentation/devicetree/bindings/mmc/owl-mmc.yaml b/Documentation/devicetree/bindings/mmc/owl-mmc.yaml index 1380501fb8f0..5eab25ccf7ae 100644 --- a/Documentation/devicetree/bindings/mmc/owl-mmc.yaml +++ b/Documentation/devicetree/bindings/mmc/owl-mmc.yaml @@ -14,7 +14,11 @@ maintainers: properties: compatible: - const: actions,owl-mmc + oneOf: + - const: actions,owl-mmc + - items: + - const: actions,s700-mmc + - const: actions,owl-mmc reg: maxItems: 1 From 0caf60c4b110b5aea9a0700be277421c1f575041 Mon Sep 17 00:00:00 2001 From: Amey Narkhede Date: Fri, 18 Sep 2020 00:56:25 +0530 Subject: [PATCH 085/271] mmc: mediatek: Drop pointer to mmc_host from msdc_host The MediaTek MMC driver uses pointer to get from private msdc_host structure to the generic mmc_host structure. However mmc_host always precedes msdc_host in memory so compute its address with a subtraction (which is cheaper than a dereference) using mmc_from_priv() and drop the extra pointer. Signed-off-by: Amey Narkhede Link: https://lore.kernel.org/r/20200917192624.548720-1-ameynarkhede03@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/mtk-sd.c | 45 ++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 20 deletions(-) diff --git a/drivers/mmc/host/mtk-sd.c b/drivers/mmc/host/mtk-sd.c index 691c928edab7..a704745e5882 100644 --- a/drivers/mmc/host/mtk-sd.c +++ b/drivers/mmc/host/mtk-sd.c @@ -397,7 +397,6 @@ struct msdc_delay_phase { struct msdc_host { struct device *dev; const struct mtk_mmc_compatible *dev_comp; - struct mmc_host *mmc; /* mmc structure */ int cmd_rsp; spinlock_t lock; @@ -734,14 +733,15 @@ static void msdc_unprepare_data(struct msdc_host *host, struct mmc_request *mrq) static u64 msdc_timeout_cal(struct msdc_host *host, u64 ns, u64 clks) { + struct mmc_host *mmc = mmc_from_priv(host); u64 timeout, clk_ns; u32 mode = 0; - if (host->mmc->actual_clock == 0) { + if (mmc->actual_clock == 0) { timeout = 0; } else { clk_ns = 1000000000ULL; - do_div(clk_ns, host->mmc->actual_clock); + do_div(clk_ns, mmc->actual_clock); timeout = ns + clk_ns - 1; do_div(timeout, clk_ns); timeout += clks; @@ -802,6 +802,7 @@ static void msdc_ungate_clock(struct msdc_host *host) static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) { + struct mmc_host *mmc = mmc_from_priv(host); u32 mode; u32 flags; u32 div; @@ -811,7 +812,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) if (!hz) { dev_dbg(host->dev, "set mclk to 0\n"); host->mclk = 0; - host->mmc->actual_clock = 0; + mmc->actual_clock = 0; sdr_clr_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); return; } @@ -890,7 +891,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) while (!(readl(host->base + MSDC_CFG) & MSDC_CFG_CKSTB)) cpu_relax(); sdr_set_bits(host->base + MSDC_CFG, MSDC_CFG_CKPDN); - host->mmc->actual_clock = sclk; + mmc->actual_clock = sclk; host->mclk = hz; host->timing = timing; /* need because clk changed. */ @@ -901,7 +902,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) * mmc_select_hs400() will drop to 50Mhz and High speed mode, * tune result of hs200/200Mhz is not suitable for 50Mhz */ - if (host->mmc->actual_clock <= 52000000) { + if (mmc->actual_clock <= 52000000) { writel(host->def_tune_para.iocon, host->base + MSDC_IOCON); if (host->top_base) { writel(host->def_tune_para.emmc_top_control, @@ -932,7 +933,7 @@ static void msdc_set_mclk(struct msdc_host *host, unsigned char timing, u32 hz) sdr_set_field(host->base + tune_reg, MSDC_PAD_TUNE_CMDRRDLY, host->hs400_cmd_int_delay); - dev_dbg(host->dev, "sclk: %d, timing: %d\n", host->mmc->actual_clock, + dev_dbg(host->dev, "sclk: %d, timing: %d\n", mmc->actual_clock, timing); } @@ -967,6 +968,7 @@ static inline u32 msdc_cmd_find_resp(struct msdc_host *host, static inline u32 msdc_cmd_prepare_raw_cmd(struct msdc_host *host, struct mmc_request *mrq, struct mmc_command *cmd) { + struct mmc_host *mmc = mmc_from_priv(host); /* rawcmd : * vol_swt << 30 | auto_cmd << 28 | blklen << 16 | go_irq << 15 | * stop << 14 | rw << 13 | dtype << 11 | rsptyp << 7 | brk << 6 | opcode @@ -993,7 +995,7 @@ static inline u32 msdc_cmd_prepare_raw_cmd(struct msdc_host *host, struct mmc_data *data = cmd->data; if (mmc_op_multi(opcode)) { - if (mmc_card_mmc(host->mmc->card) && mrq->sbc && + if (mmc_card_mmc(mmc->card) && mrq->sbc && !(mrq->sbc->arg & 0xFFFF0000)) rawcmd |= 0x2 << 28; /* AutoCMD23 */ } @@ -1070,9 +1072,10 @@ static int msdc_auto_cmd_done(struct msdc_host *host, int events, */ static void msdc_recheck_sdio_irq(struct msdc_host *host) { + struct mmc_host *mmc = mmc_from_priv(host); u32 reg_int, reg_inten, reg_ps; - if (host->mmc->caps & MMC_CAP_SDIO_IRQ) { + if (mmc->caps & MMC_CAP_SDIO_IRQ) { reg_inten = readl(host->base + MSDC_INTEN); if (reg_inten & MSDC_INTEN_SDIOIRQ) { reg_int = readl(host->base + MSDC_INT); @@ -1080,7 +1083,7 @@ static void msdc_recheck_sdio_irq(struct msdc_host *host) if (!(reg_int & MSDC_INT_SDIOIRQ || reg_ps & MSDC_PS_DATA1)) { __msdc_enable_sdio_irq(host, 0); - sdio_signal_irq(host->mmc); + sdio_signal_irq(mmc); } } } @@ -1113,7 +1116,7 @@ static void msdc_request_done(struct msdc_host *host, struct mmc_request *mrq) msdc_unprepare_data(host, mrq); if (host->error) msdc_reset_hw(host); - mmc_request_done(host->mmc, mrq); + mmc_request_done(mmc_from_priv(host), mrq); if (host->dev_comp->recheck_sdio_irq) msdc_recheck_sdio_irq(host); } @@ -1500,6 +1503,7 @@ static void msdc_enable_sdio_irq(struct mmc_host *mmc, int enb) static irqreturn_t msdc_cmdq_irq(struct msdc_host *host, u32 intsts) { + struct mmc_host *mmc = mmc_from_priv(host); int cmd_err = 0, dat_err = 0; if (intsts & MSDC_INT_RSPCRCERR) { @@ -1523,12 +1527,13 @@ static irqreturn_t msdc_cmdq_irq(struct msdc_host *host, u32 intsts) cmd_err, dat_err, intsts); } - return cqhci_irq(host->mmc, 0, cmd_err, dat_err); + return cqhci_irq(mmc, 0, cmd_err, dat_err); } static irqreturn_t msdc_irq(int irq, void *dev_id) { struct msdc_host *host = (struct msdc_host *) dev_id; + struct mmc_host *mmc = mmc_from_priv(host); while (true) { unsigned long flags; @@ -1551,18 +1556,18 @@ static irqreturn_t msdc_irq(int irq, void *dev_id) spin_unlock_irqrestore(&host->lock, flags); if ((events & event_mask) & MSDC_INT_SDIOIRQ) - sdio_signal_irq(host->mmc); + sdio_signal_irq(mmc); if ((events & event_mask) & MSDC_INT_CDSC) { if (host->internal_cd) - mmc_detect_change(host->mmc, msecs_to_jiffies(20)); + mmc_detect_change(mmc, msecs_to_jiffies(20)); events &= ~MSDC_INT_CDSC; } if (!(events & (event_mask & ~MSDC_INT_SDIOIRQ))) break; - if ((host->mmc->caps2 & MMC_CAP2_CQE) && + if ((mmc->caps2 & MMC_CAP2_CQE) && (events & MSDC_INT_CMDQ)) { msdc_cmdq_irq(host, events); /* clear interrupts */ @@ -2456,7 +2461,6 @@ static int msdc_drv_probe(struct platform_device *pdev) host->dev = &pdev->dev; host->dev_comp = of_device_get_match_data(&pdev->dev); - host->mmc = mmc; host->src_clk_freq = clk_get_rate(host->src_clk); /* Set host parameters to mmc */ mmc->ops = &mt_msdc_ops; @@ -2497,7 +2501,7 @@ static int msdc_drv_probe(struct platform_device *pdev) mmc_dev(mmc)->dma_mask = &host->dma_mask; if (mmc->caps2 & MMC_CAP2_CQE) { - host->cq_host = devm_kzalloc(host->mmc->parent, + host->cq_host = devm_kzalloc(mmc->parent, sizeof(*host->cq_host), GFP_KERNEL); if (!host->cq_host) { @@ -2582,7 +2586,7 @@ static int msdc_drv_remove(struct platform_device *pdev) pm_runtime_get_sync(host->dev); platform_set_drvdata(pdev, NULL); - mmc_remove_host(host->mmc); + mmc_remove_host(mmc); msdc_deinit_hw(host); msdc_gate_clock(host); @@ -2594,7 +2598,7 @@ static int msdc_drv_remove(struct platform_device *pdev) dma_free_coherent(&pdev->dev, MAX_BD_NUM * sizeof(struct mt_bdma_desc), host->dma.bd, host->dma.bd_addr); - mmc_free_host(host->mmc); + mmc_free_host(mmc); return 0; } @@ -2629,6 +2633,7 @@ static void msdc_save_reg(struct msdc_host *host) static void msdc_restore_reg(struct msdc_host *host) { + struct mmc_host *mmc = mmc_from_priv(host); u32 tune_reg = host->dev_comp->pad_tune_reg; writel(host->save_para.msdc_cfg, host->base + MSDC_CFG); @@ -2653,7 +2658,7 @@ static void msdc_restore_reg(struct msdc_host *host) writel(host->save_para.pad_tune, host->base + tune_reg); } - if (sdio_irq_claimed(host->mmc)) + if (sdio_irq_claimed(mmc)) __msdc_enable_sdio_irq(host, 1); } From 3439c588c23c7058b184240b11ffe7edd7b8ac7c Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Fri, 18 Sep 2020 23:54:46 +0200 Subject: [PATCH 086/271] mmc: core: document mmc_hw_reset() Add documentation for mmc_hw_reset to make sure the intended use case is clear. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20200918215446.65654-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/core/core.c | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/drivers/mmc/core/core.c b/drivers/mmc/core/core.c index 8ccae6452b9c..d42037f0f10d 100644 --- a/drivers/mmc/core/core.c +++ b/drivers/mmc/core/core.c @@ -2063,6 +2063,16 @@ static void mmc_hw_reset_for_init(struct mmc_host *host) host->ops->hw_reset(host); } +/** + * mmc_hw_reset - reset the card in hardware + * @host: MMC host to which the card is attached + * + * Hard reset the card. This function is only for upper layers, like the + * block layer or card drivers. You cannot use it in host drivers (struct + * mmc_card might be gone then). + * + * Return: 0 on success, -errno on failure + */ int mmc_hw_reset(struct mmc_host *host) { int ret; From 8dae6a249c635cdc637ae07761beef866480ca0e Mon Sep 17 00:00:00 2001 From: Qinglang Miao Date: Mon, 21 Sep 2020 21:10:42 +0800 Subject: [PATCH 087/271] mmc: rtsx_usb_sdmmc: simplify the return expression of sd_change_phase() Simplify the return expression. Signed-off-by: Qinglang Miao Link: https://lore.kernel.org/r/20200921131042.92340-1-miaoqinglang@huawei.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/rtsx_usb_sdmmc.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/mmc/host/rtsx_usb_sdmmc.c b/drivers/mmc/host/rtsx_usb_sdmmc.c index 598f49573f5d..5fe4528e296e 100644 --- a/drivers/mmc/host/rtsx_usb_sdmmc.c +++ b/drivers/mmc/host/rtsx_usb_sdmmc.c @@ -579,7 +579,6 @@ static void sd_normal_rw(struct rtsx_usb_sdmmc *host, static int sd_change_phase(struct rtsx_usb_sdmmc *host, u8 sample_point, int tx) { struct rtsx_ucr *ucr = host->ucr; - int err; dev_dbg(sdmmc_dev(host), "%s: %s sample_point = %d\n", __func__, tx ? "TX" : "RX", sample_point); @@ -601,11 +600,7 @@ static int sd_change_phase(struct rtsx_usb_sdmmc *host, u8 sample_point, int tx) rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, CLK_DIV, CLK_CHANGE, 0); rtsx_usb_add_cmd(ucr, WRITE_REG_CMD, SD_CFG1, SD_ASYNC_FIFO_RST, 0); - err = rtsx_usb_send_cmd(ucr, MODE_C, 100); - if (err) - return err; - - return 0; + return rtsx_usb_send_cmd(ucr, MODE_C, 100); } static inline u32 get_phase_point(u32 phase_map, unsigned int idx) From fbb31330f9b00bf62fe126a1ea118c02389b06f0 Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Tue, 22 Sep 2020 19:22:53 +0200 Subject: [PATCH 088/271] mmc: renesas_sdhi: drop local flag for tuning The MMC core has now a generic check if some tuning is in progress. Its protected area is a bit larger than the custom one in this driver but we concluded that this works equally well for the intended case. So, drop the local flag and switch to the generic one. Signed-off-by: Wolfram Sang Reviewed-by: Yoshihiro Shimoda Tested-by: Yoshihiro Shimoda Link: https://lore.kernel.org/r/20200922172253.4458-1-wsa@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi.h | 1 - drivers/mmc/host/renesas_sdhi_core.c | 4 +--- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/drivers/mmc/host/renesas_sdhi.h b/drivers/mmc/host/renesas_sdhi.h index 24958de274c1..cb962c7883dc 100644 --- a/drivers/mmc/host/renesas_sdhi.h +++ b/drivers/mmc/host/renesas_sdhi.h @@ -63,7 +63,6 @@ struct renesas_sdhi { u32 scc_tappos_hs400; const u8 *adjust_hs400_calib_table; bool needs_adjust_hs400; - bool doing_tune; /* Tuning values: 1 for success, 0 for failure */ DECLARE_BITMAP(taps, BITS_PER_LONG); diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index d621a4af8e87..20e5eb63caf8 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -581,7 +581,6 @@ static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) unsigned int taps_size = priv->tap_num * 2, min_tap_row; unsigned long *bitmap; - priv->doing_tune = false; sd_scc_write32(host, priv, SH_MOBILE_SDHI_SCC_RVSREQ, 0); /* @@ -656,7 +655,6 @@ static int renesas_sdhi_execute_tuning(struct mmc_host *mmc, u32 opcode) return -EINVAL; } - priv->doing_tune = true; bitmap_zero(priv->taps, priv->tap_num * 2); bitmap_zero(priv->smpcmp, priv->tap_num * 2); @@ -765,7 +763,7 @@ static bool renesas_sdhi_check_scc_error(struct tmio_mmc_host *host) !(host->mmc->ios.timing == MMC_TIMING_MMC_HS400 && !use_4tap)) return false; - if (mmc_doing_retune(host->mmc) || priv->doing_tune) + if (mmc_doing_tune(host->mmc)) return false; if (sd_scc_read32(host, priv, SH_MOBILE_SDHI_SCC_RVSCNTL) & From 6b28f2c4da7e196062d84b143294cf6d86f6e02c Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 23 Sep 2020 17:37:38 +0200 Subject: [PATCH 089/271] mmc: moxart: remove unneeded check for drvdata The 'struct mmc_host *mmc' comes from drvdata set at the end of probe, so it cannot be NULL. The code already dereferences it few lines before the check with mmc_priv(). This also fixes smatch warning: drivers/mmc/host/moxart-mmc.c:692 moxart_remove() warn: variable dereferenced before check 'mmc' (see line 688) Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200923153739.30327-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/moxart-mmc.c | 23 +++++++++++------------ 1 file changed, 11 insertions(+), 12 deletions(-) diff --git a/drivers/mmc/host/moxart-mmc.c b/drivers/mmc/host/moxart-mmc.c index 2bfb376fddc4..f25079ba3bca 100644 --- a/drivers/mmc/host/moxart-mmc.c +++ b/drivers/mmc/host/moxart-mmc.c @@ -689,19 +689,18 @@ static int moxart_remove(struct platform_device *pdev) dev_set_drvdata(&pdev->dev, NULL); - if (mmc) { - if (!IS_ERR(host->dma_chan_tx)) - dma_release_channel(host->dma_chan_tx); - if (!IS_ERR(host->dma_chan_rx)) - dma_release_channel(host->dma_chan_rx); - mmc_remove_host(mmc); - mmc_free_host(mmc); + if (!IS_ERR(host->dma_chan_tx)) + dma_release_channel(host->dma_chan_tx); + if (!IS_ERR(host->dma_chan_rx)) + dma_release_channel(host->dma_chan_rx); + mmc_remove_host(mmc); + mmc_free_host(mmc); + + writel(0, host->base + REG_INTERRUPT_MASK); + writel(0, host->base + REG_POWER_CONTROL); + writel(readl(host->base + REG_CLOCK_CONTROL) | CLK_OFF, + host->base + REG_CLOCK_CONTROL); - writel(0, host->base + REG_INTERRUPT_MASK); - writel(0, host->base + REG_POWER_CONTROL); - writel(readl(host->base + REG_CLOCK_CONTROL) | CLK_OFF, - host->base + REG_CLOCK_CONTROL); - } return 0; } From 0cb231f1e034bfbb3ccbaf759721d136a6847dde Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Wed, 23 Sep 2020 17:37:39 +0200 Subject: [PATCH 090/271] mmc: sdhci: fix indentation mistakes Fix inconsistent indenting, reported by Smatch: drivers/mmc/host/sdhci-esdhc-imx.c:1380 sdhci_esdhc_imx_hwinit() warn: inconsistent indenting drivers/mmc/host/sdhci-sprd.c:390 sdhci_sprd_request_done() warn: inconsistent indenting Reported-by: kernel test robot Reported-by: Dan Carpenter Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200923153739.30327-2-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-esdhc-imx.c | 2 +- drivers/mmc/host/sdhci-sprd.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/mmc/host/sdhci-esdhc-imx.c b/drivers/mmc/host/sdhci-esdhc-imx.c index 62b3d4767916..fce8fa7e6b30 100644 --- a/drivers/mmc/host/sdhci-esdhc-imx.c +++ b/drivers/mmc/host/sdhci-esdhc-imx.c @@ -1377,7 +1377,7 @@ static void sdhci_esdhc_imx_hwinit(struct sdhci_host *host) * response, block the tuning procedure or the first command * after the whole tuning procedure always can't get any response. */ - tmp |= ESDHC_TUNING_CMD_CRC_CHECK_DISABLE; + tmp |= ESDHC_TUNING_CMD_CRC_CHECK_DISABLE; writel(tmp, host->ioaddr + ESDHC_TUNING_CTRL); } else if (imx_data->socdata->flags & ESDHC_FLAG_MAN_TUNING) { /* diff --git a/drivers/mmc/host/sdhci-sprd.c b/drivers/mmc/host/sdhci-sprd.c index 1efaf602c206..58109c5b53e2 100644 --- a/drivers/mmc/host/sdhci-sprd.c +++ b/drivers/mmc/host/sdhci-sprd.c @@ -387,7 +387,7 @@ static void sdhci_sprd_request_done(struct sdhci_host *host, if (mmc_hsq_finalize_request(host->mmc, mrq)) return; - mmc_request_done(host->mmc, mrq); + mmc_request_done(host->mmc, mrq); } static struct sdhci_ops sdhci_sprd_ops = { @@ -433,7 +433,7 @@ static void sdhci_sprd_request(struct mmc_host *mmc, struct mmc_request *mrq) } static int sdhci_sprd_request_atomic(struct mmc_host *mmc, - struct mmc_request *mrq) + struct mmc_request *mrq) { sdhci_sprd_check_auto_cmd23(mmc, mrq); From 94d4c3cffefc702da2f906bfc68b774659f1c8fc Mon Sep 17 00:00:00 2001 From: Krzysztof Kozlowski Date: Fri, 25 Sep 2020 09:25:32 +0200 Subject: [PATCH 091/271] mmc: sdhci-s3c: hide forward declaration of of_device_id behind CONFIG_OF The struct of_device_id is not defined with !CONFIG_OF so its forward declaration should be hidden to as well. This should address clang compile warning: drivers/mmc/host/sdhci-s3c.c:464:34: warning: tentative array definition assumed to have one element Reported-by: kernel test robot Signed-off-by: Krzysztof Kozlowski Link: https://lore.kernel.org/r/20200925072532.10272-1-krzk@kernel.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-s3c.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/drivers/mmc/host/sdhci-s3c.c b/drivers/mmc/host/sdhci-s3c.c index cb5f87be7535..f48a788a9d3d 100644 --- a/drivers/mmc/host/sdhci-s3c.c +++ b/drivers/mmc/host/sdhci-s3c.c @@ -461,7 +461,9 @@ static int sdhci_s3c_parse_dt(struct device *dev, } #endif +#ifdef CONFIG_OF static const struct of_device_id sdhci_s3c_dt_match[]; +#endif static inline struct sdhci_s3c_drv_data *sdhci_s3c_get_driver_data( struct platform_device *pdev) From 3157b035f499e48334be1ec74b297721b82adb4b Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Fri, 25 Sep 2020 09:43:23 -0700 Subject: [PATCH 092/271] mmc: host: fix depends for MMC_MESON_GX w/ COMPILE_TEST Fix build errors for meson-gx-mmc.c when CONFIG_COMMON_CLK is not set/enabled. This can happen when COMPILE_TEST is set/enabled. ERROR: modpost: "clk_divider_ops" [drivers/mmc/host/meson-gx-mmc.ko] undefined! ERROR: modpost: "devm_clk_register" [drivers/mmc/host/meson-gx-mmc.ko] undefined! ERROR: modpost: "clk_mux_ops" [drivers/mmc/host/meson-gx-mmc.ko] undefined! ERROR: modpost: "__clk_get_name" [drivers/mmc/host/meson-gx-mmc.ko] undefined! Fixes: 54d8454436a2 ("mmc: host: Enable compile testing of multiple drivers") Signed-off-by: Randy Dunlap Reviewed-by: Krzysztof Kozlowski Reported-by: kernel test robot Link: https://lore.kernel.org/r/20200925164323.29843-1-rdunlap@infradead.org Signed-off-by: Ulf Hansson --- drivers/mmc/host/Kconfig | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/Kconfig b/drivers/mmc/host/Kconfig index 185a2564e623..f0cb7aeabbc4 100644 --- a/drivers/mmc/host/Kconfig +++ b/drivers/mmc/host/Kconfig @@ -425,6 +425,7 @@ config MMC_SDHCI_IPROC config MMC_MESON_GX tristate "Amlogic S905/GX*/AXG SD/MMC Host Controller support" depends on ARCH_MESON|| COMPILE_TEST + depends on COMMON_CLK help This selects support for the Amlogic SD/MMC Host Controller found on the S905/GX*/AXG family of SoCs. This controller is From 975520fc73809a3a0904f0c67d8640945696e798 Mon Sep 17 00:00:00 2001 From: Yangbo Lu Date: Sun, 27 Sep 2020 16:23:04 +0800 Subject: [PATCH 093/271] mmc: sdhci-of-esdhc: fix reference clock source selection The bit ESDHC_PERIPHERAL_CLK_SEL to select using peripheral clock or platform clock is not able to be reset by SDHCI_RESET_ALL. So driver needs to initialize it as 1 or 0 once, to override the different value which may be configured in bootloader. Signed-off-by: Yangbo Lu Link: https://lore.kernel.org/r/20200927082304.9232-1-yangbo.lu@nxp.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-of-esdhc.c | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/drivers/mmc/host/sdhci-of-esdhc.c b/drivers/mmc/host/sdhci-of-esdhc.c index 3a3340c9b8c9..0b45eff6fed4 100644 --- a/drivers/mmc/host/sdhci-of-esdhc.c +++ b/drivers/mmc/host/sdhci-of-esdhc.c @@ -1360,13 +1360,19 @@ static void esdhc_init(struct platform_device *pdev, struct sdhci_host *host) clk_put(clk); } - if (esdhc->peripheral_clock) { - esdhc_clock_enable(host, false); - val = sdhci_readl(host, ESDHC_DMA_SYSCTL); + esdhc_clock_enable(host, false); + val = sdhci_readl(host, ESDHC_DMA_SYSCTL); + /* + * This bit is not able to be reset by SDHCI_RESET_ALL. Need to + * initialize it as 1 or 0 once, to override the different value + * which may be configured in bootloader. + */ + if (esdhc->peripheral_clock) val |= ESDHC_PERIPHERAL_CLK_SEL; - sdhci_writel(host, val, ESDHC_DMA_SYSCTL); - esdhc_clock_enable(host, true); - } + else + val &= ~ESDHC_PERIPHERAL_CLK_SEL; + sdhci_writel(host, val, ESDHC_DMA_SYSCTL); + esdhc_clock_enable(host, true); } static int esdhc_hs400_prepare_ddr(struct mmc_host *mmc) From 407d0c2cdd12b5e72695686d4097e7f1331a093e Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Wed, 23 Sep 2020 16:22:01 +0530 Subject: [PATCH 094/271] dt-bindings: mmc: sdhci-am654: Convert sdhci-am654 controller documentation to json schema Convert sdhci-am654 documentation to yaml format. The new file sdhci-am654.yaml will inherit from mmc-controller.yaml. Signed-off-by: Faiz Abbas Link: https://lore.kernel.org/r/20200923105206.7988-2-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/sdhci-am654.txt | 65 ------- .../devicetree/bindings/mmc/sdhci-am654.yaml | 175 ++++++++++++++++++ 2 files changed, 175 insertions(+), 65 deletions(-) delete mode 100644 Documentation/devicetree/bindings/mmc/sdhci-am654.txt create mode 100644 Documentation/devicetree/bindings/mmc/sdhci-am654.yaml diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt b/Documentation/devicetree/bindings/mmc/sdhci-am654.txt deleted file mode 100644 index b49cbfdd679f..000000000000 --- a/Documentation/devicetree/bindings/mmc/sdhci-am654.txt +++ /dev/null @@ -1,65 +0,0 @@ -Device Tree Bindings for the SDHCI Controllers present on TI's AM654 SOCs - -The bindings follow the mmc[1], clock[2] and interrupt[3] bindings. -Only deviations are documented here. - - [1] Documentation/devicetree/bindings/mmc/mmc.txt - [2] Documentation/devicetree/bindings/clock/clock-bindings.txt - [3] Documentation/devicetree/bindings/interrupt-controller/interrupts.txt - -Required Properties: - - compatible: should be one of: - "ti,am654-sdhci-5.1": SDHCI on AM654 device. - "ti,j721e-sdhci-8bit": 8 bit SDHCI on J721E device. - "ti,j721e-sdhci-4bit": 4 bit SDHCI on J721E device. - "ti,j7200-sdhci-8bit": 8 bit SDHCI on J7200 device. - "ti,j7200-sdhci-4bit": 4 bit SDHCI on J7200 device. - - reg: Must be two entries. - - The first should be the sdhci register space - - The second should the subsystem/phy register space - - clocks: Handles to the clock inputs. - - clock-names: Tuple including "clk_xin" and "clk_ahb" - - interrupts: Interrupt specifiers - Output tap delay for each speed mode: - - ti,otap-del-sel-legacy - - ti,otap-del-sel-mmc-hs - - ti,otap-del-sel-sd-hs - - ti,otap-del-sel-sdr12 - - ti,otap-del-sel-sdr25 - - ti,otap-del-sel-sdr50 - - ti,otap-del-sel-sdr104 - - ti,otap-del-sel-ddr50 - - ti,otap-del-sel-ddr52 - - ti,otap-del-sel-hs200 - - ti,otap-del-sel-hs400 - These bindings must be provided otherwise the driver will disable the - corresponding speed mode (i.e. all nodes must provide at least -legacy) - -Optional Properties (Required for ti,am654-sdhci-5.1, - ti,j721e-sdhci-8bit, - ti,j7200-sdhci-8bit): - - ti,trm-icp: DLL trim select - - ti,driver-strength-ohm: driver strength in ohms. - Valid values are 33, 40, 50, 66 and 100 ohms. -Optional Properties: - - ti,strobe-sel: strobe select delay for HS400 speed mode. Default value: 0x0. - - ti,clkbuf-sel: Clock Delay Buffer Select - -Example: - - sdhci0: sdhci@4f80000 { - compatible = "ti,am654-sdhci-5.1"; - reg = <0x0 0x4f80000 0x0 0x260>, <0x0 0x4f90000 0x0 0x134>; - power-domains = <&k3_pds 47>; - clocks = <&k3_clks 47 0>, <&k3_clks 47 1>; - clock-names = "clk_ahb", "clk_xin"; - interrupts = ; - sdhci-caps-mask = <0x80000007 0x0>; - mmc-ddr-1_8v; - ti,otap-del-sel-legacy = <0x0>; - ti,otap-del-sel-mmc-hs = <0x0>; - ti,otap-del-sel-ddr52 = <0x5>; - ti,otap-del-sel-hs200 = <0x5>; - ti,otap-del-sel-hs400 = <0x0>; - ti,trm-icp = <0x8>; - }; diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml new file mode 100644 index 000000000000..c222e057eca9 --- /dev/null +++ b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml @@ -0,0 +1,175 @@ +# SPDX-License-Identifier: (GPL-2.0-only OR BSD-2-Clause) +# Copyright (C) 2020 Texas Instruments Incorporated - http://www.ti.com/ +%YAML 1.2 +--- +$id: "http://devicetree.org/schemas/mmc/sdhci-am654.yaml#" +$schema : "http://devicetree.org/meta-schemas/core.yaml#" + +title: TI AM654 MMC Controller + +maintainers: + - Ulf Hansson + +allOf: + - $ref: mmc-controller.yaml# + +properties: + compatible: + enum: + - ti,am654-sdhci-5.1 + - ti,j721e-sdhci-8bit + - ti,j721e-sdhci-4bit + - ti,j7200-sdhci-8bit + - ti,j721e-sdhci-4bit + + reg: + maxItems: 2 + + interrupts: + maxItems: 1 + + power-domains: + maxItems: 1 + + clocks: + minItems: 1 + maxItems: 2 + description: Handles to input clocks + + clock-names: + minItems: 1 + maxItems: 2 + items: + - const: clk_ahb + - const: clk_xin + + # PHY output tap delays: + # Used to delay the data valid window and align it to the sampling clock. + # Binding needs to be provided for each supported speed mode otherwise the + # corresponding mode will be disabled. + + ti,otap-del-sel-legacy: + description: Output tap delay for SD/MMC legacy timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-mmc-hs: + description: Output tap delay for MMC high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sd-hs: + description: Output tap delay for SD high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr12: + description: Output tap delay for SD UHS SDR12 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr25: + description: Output tap delay for SD UHS SDR25 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr50: + description: Output tap delay for SD UHS SDR50 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-sdr104: + description: Output tap delay for SD UHS SDR104 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-ddr50: + description: Output tap delay for SD UHS DDR50 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-ddr52: + description: Output tap delay for eMMC DDR52 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-hs200: + description: Output tap delay for eMMC HS200 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,otap-del-sel-hs400: + description: Output tap delay for eMMC HS400 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,trm-icp: + description: DLL trim select + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0xf + + ti,driver-strength-ohm: + description: DLL drive strength in ohms + $ref: "/schemas/types.yaml#/definitions/uint32" + oneOf: + - enum: + - 33 + - 40 + - 50 + - 66 + - 100 + + ti,strobe-sel: + description: strobe select delay for HS400 speed mode. + $ref: "/schemas/types.yaml#/definitions/uint32" + + ti,clkbuf-sel: + description: Clock Delay Buffer Select + $ref: "/schemas/types.yaml#/definitions/uint32" + +required: + - compatible + - reg + - interrupts + - clocks + - clock-names + - ti,otap-del-sel-legacy + +examples: + - | + #include + #include + + bus { + #address-cells = <2>; + #size-cells = <2>; + + mmc0: mmc@4f80000 { + compatible = "ti,am654-sdhci-5.1"; + reg = <0x0 0x4f80000 0x0 0x260>, <0x0 0x4f90000 0x0 0x134>; + power-domains = <&k3_pds 47>; + clocks = <&k3_clks 47 0>, <&k3_clks 47 1>; + clock-names = "clk_ahb", "clk_xin"; + interrupts = ; + sdhci-caps-mask = <0x80000007 0x0>; + mmc-ddr-1_8v; + ti,otap-del-sel-legacy = <0x0>; + ti,otap-del-sel-mmc-hs = <0x0>; + ti,otap-del-sel-ddr52 = <0x5>; + ti,otap-del-sel-hs200 = <0x5>; + ti,otap-del-sel-hs400 = <0x0>; + ti,trm-icp = <0x8>; + }; + }; From b140954c5f5f6a595d9f474572a408caff558fa5 Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Wed, 23 Sep 2020 16:22:02 +0530 Subject: [PATCH 095/271] dt-bindings: mmc: sdhci-am654: Add documentation for input tap delay Add documentation for input tap delay bindings. Signed-off-by: Faiz Abbas Link: https://lore.kernel.org/r/20200923105206.7988-3-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- .../devicetree/bindings/mmc/sdhci-am654.yaml | 43 +++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml index c222e057eca9..ac79f3adf20b 100644 --- a/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml +++ b/Documentation/devicetree/bindings/mmc/sdhci-am654.yaml @@ -114,6 +114,46 @@ properties: minimum: 0 maximum: 0xf + # PHY input tap delays: + # Used to delay the data valid window and align it to the sampling clock for + # modes that don't support tuning + + ti,itap-del-sel-legacy: + description: Input tap delay for SD/MMC legacy timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-mmc-hs: + description: Input tap delay for MMC high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-sd-hs: + description: Input tap delay for SD high speed timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-sdr12: + description: Input tap delay for SD UHS SDR12 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-sdr25: + description: Input tap delay for SD UHS SDR25 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + + ti,itap-del-sel-ddr52: + description: Input tap delay for MMC DDR52 timing + $ref: "/schemas/types.yaml#/definitions/uint32" + minimum: 0 + maximum: 0x1f + ti,trm-icp: description: DLL trim select $ref: "/schemas/types.yaml#/definitions/uint32" @@ -170,6 +210,9 @@ examples: ti,otap-del-sel-ddr52 = <0x5>; ti,otap-del-sel-hs200 = <0x5>; ti,otap-del-sel-hs400 = <0x0>; + ti,itap-del-sel-legacy = <0x10>; + ti,itap-del-sel-mmc-hs = <0xa>; + ti,itap-del-sel-ddr52 = <0x3>; ti,trm-icp = <0x8>; }; }; From 1e753dbb9ce824fc490c0c3757c4f39df72dc28b Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Wed, 23 Sep 2020 16:22:03 +0530 Subject: [PATCH 096/271] mmc: sdhci_am654: Fix hard coded otap delay array size Change hard coded array size value to depend on struct timing_data array size. Signed-off-by: Faiz Abbas Link: https://lore.kernel.org/r/20200923105206.7988-4-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 44 +++++++++++++++++----------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index a4c6d9d80e88..9f3347bc3757 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -85,28 +85,6 @@ static struct regmap_config sdhci_am654_regmap_config = { .fast_io = true, }; -struct sdhci_am654_data { - struct regmap *base; - bool legacy_otapdly; - int otap_del_sel[11]; - int clkbuf_sel; - int trm_icp; - int drv_strength; - bool dll_on; - int strb_sel; - u32 flags; -}; - -struct sdhci_am654_driver_data { - const struct sdhci_pltfm_data *pdata; - u32 flags; -#define IOMUX_PRESENT (1 << 0) -#define FREQSEL_2_BIT (1 << 1) -#define STRBSEL_4_BIT (1 << 2) -#define DLL_PRESENT (1 << 3) -#define DLL_CALIB (1 << 4) -}; - struct timing_data { const char *binding; u32 capability; @@ -127,6 +105,28 @@ static const struct timing_data td[] = { [MMC_TIMING_MMC_HS400] = {"ti,otap-del-sel-hs400", MMC_CAP2_HS400}, }; +struct sdhci_am654_data { + struct regmap *base; + bool legacy_otapdly; + int otap_del_sel[ARRAY_SIZE(td)]; + int clkbuf_sel; + int trm_icp; + int drv_strength; + bool dll_on; + int strb_sel; + u32 flags; +}; + +struct sdhci_am654_driver_data { + const struct sdhci_pltfm_data *pdata; + u32 flags; +#define IOMUX_PRESENT (1 << 0) +#define FREQSEL_2_BIT (1 << 1) +#define STRBSEL_4_BIT (1 << 2) +#define DLL_PRESENT (1 << 3) +#define DLL_CALIB (1 << 4) +}; + static void sdhci_am654_setup_dll(struct sdhci_host *host, unsigned int clock) { struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); From a0a62497f6aac0f9ba25e7befb334046c98fdf92 Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Wed, 23 Sep 2020 16:22:04 +0530 Subject: [PATCH 097/271] mmc: sdhci_am654: Add support for input tap delay DLL need only be enabled for speed modes and clock frequencies at or above 50 MHz. For speed modes that don't enable the DLL, we need to configure a static input delay value. This involves reading an optional itap-del-sel-* value from the device tree and configuring it for the appropriate speed mode. With this addition, make sure that DLL is always switched off at the beginning of the set_clock() call to simplify configuration. This also removes the need for the dll_on member in struct sdhci_am654_data. Signed-off-by: Faiz Abbas Link: https://lore.kernel.org/r/20200923105206.7988-5-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 114 ++++++++++++++++++++++++--------- 1 file changed, 84 insertions(+), 30 deletions(-) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 9f3347bc3757..1213b711e60a 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -66,6 +66,14 @@ #define RETRIM_MASK BIT(RETRIM_SHIFT) #define SELDLYTXCLK_SHIFT 17 #define SELDLYTXCLK_MASK BIT(SELDLYTXCLK_SHIFT) +#define SELDLYRXCLK_SHIFT 16 +#define SELDLYRXCLK_MASK BIT(SELDLYRXCLK_SHIFT) +#define ITAPDLYSEL_SHIFT 0 +#define ITAPDLYSEL_MASK GENMASK(4, 0) +#define ITAPDLYENA_SHIFT 8 +#define ITAPDLYENA_MASK BIT(ITAPDLYENA_SHIFT) +#define ITAPCHGWIN_SHIFT 9 +#define ITAPCHGWIN_MASK BIT(ITAPCHGWIN_SHIFT) #define DRIVER_STRENGTH_50_OHM 0x0 #define DRIVER_STRENGTH_33_OHM 0x1 @@ -73,7 +81,7 @@ #define DRIVER_STRENGTH_100_OHM 0x3 #define DRIVER_STRENGTH_40_OHM 0x4 -#define CLOCK_TOO_SLOW_HZ 400000 +#define CLOCK_TOO_SLOW_HZ 50000000 /* Command Queue Host Controller Interface Base address */ #define SDHCI_AM654_CQE_BASE_ADDR 0x200 @@ -86,33 +94,55 @@ static struct regmap_config sdhci_am654_regmap_config = { }; struct timing_data { - const char *binding; + const char *otap_binding; + const char *itap_binding; u32 capability; }; static const struct timing_data td[] = { - [MMC_TIMING_LEGACY] = {"ti,otap-del-sel-legacy", 0}, - [MMC_TIMING_MMC_HS] = {"ti,otap-del-sel-mmc-hs", MMC_CAP_MMC_HIGHSPEED}, - [MMC_TIMING_SD_HS] = {"ti,otap-del-sel-sd-hs", MMC_CAP_SD_HIGHSPEED}, - [MMC_TIMING_UHS_SDR12] = {"ti,otap-del-sel-sdr12", MMC_CAP_UHS_SDR12}, - [MMC_TIMING_UHS_SDR25] = {"ti,otap-del-sel-sdr25", MMC_CAP_UHS_SDR25}, - [MMC_TIMING_UHS_SDR50] = {"ti,otap-del-sel-sdr50", MMC_CAP_UHS_SDR50}, - [MMC_TIMING_UHS_SDR104] = {"ti,otap-del-sel-sdr104", + [MMC_TIMING_LEGACY] = {"ti,otap-del-sel-legacy", + "ti,itap-del-sel-legacy", + 0}, + [MMC_TIMING_MMC_HS] = {"ti,otap-del-sel-mmc-hs", + "ti,itap-del-sel-mmc-hs", + MMC_CAP_MMC_HIGHSPEED}, + [MMC_TIMING_SD_HS] = {"ti,otap-del-sel-sd-hs", + "ti,itap-del-sel-sd-hs", + MMC_CAP_SD_HIGHSPEED}, + [MMC_TIMING_UHS_SDR12] = {"ti,otap-del-sel-sdr12", + "ti,itap-del-sel-sdr12", + MMC_CAP_UHS_SDR12}, + [MMC_TIMING_UHS_SDR25] = {"ti,otap-del-sel-sdr25", + "ti,itap-del-sel-sdr25", + MMC_CAP_UHS_SDR25}, + [MMC_TIMING_UHS_SDR50] = {"ti,otap-del-sel-sdr50", + NULL, + MMC_CAP_UHS_SDR50}, + [MMC_TIMING_UHS_SDR104] = {"ti,otap-del-sel-sdr104", + NULL, MMC_CAP_UHS_SDR104}, - [MMC_TIMING_UHS_DDR50] = {"ti,otap-del-sel-ddr50", MMC_CAP_UHS_DDR50}, - [MMC_TIMING_MMC_DDR52] = {"ti,otap-del-sel-ddr52", MMC_CAP_DDR}, - [MMC_TIMING_MMC_HS200] = {"ti,otap-del-sel-hs200", MMC_CAP2_HS200}, - [MMC_TIMING_MMC_HS400] = {"ti,otap-del-sel-hs400", MMC_CAP2_HS400}, + [MMC_TIMING_UHS_DDR50] = {"ti,otap-del-sel-ddr50", + NULL, + MMC_CAP_UHS_DDR50}, + [MMC_TIMING_MMC_DDR52] = {"ti,otap-del-sel-ddr52", + "ti,itap-del-sel-ddr52", + MMC_CAP_DDR}, + [MMC_TIMING_MMC_HS200] = {"ti,otap-del-sel-hs200", + NULL, + MMC_CAP2_HS200}, + [MMC_TIMING_MMC_HS400] = {"ti,otap-del-sel-hs400", + NULL, + MMC_CAP2_HS400}, }; struct sdhci_am654_data { struct regmap *base; bool legacy_otapdly; int otap_del_sel[ARRAY_SIZE(td)]; + int itap_del_sel[ARRAY_SIZE(td)]; int clkbuf_sel; int trm_icp; int drv_strength; - bool dll_on; int strb_sel; u32 flags; }; @@ -135,6 +165,10 @@ static void sdhci_am654_setup_dll(struct sdhci_host *host, unsigned int clock) u32 mask, val; int ret; + /* Disable delay chain mode */ + regmap_update_bits(sdhci_am654->base, PHY_CTRL5, + SELDLYTXCLK_MASK | SELDLYRXCLK_MASK, 0); + if (sdhci_am654->flags & FREQSEL_2_BIT) { switch (clock) { case 200000000: @@ -189,8 +223,32 @@ static void sdhci_am654_setup_dll(struct sdhci_host *host, unsigned int clock) dev_err(mmc_dev(host->mmc), "DLL failed to relock\n"); return; } +} - sdhci_am654->dll_on = true; +static void sdhci_am654_write_itapdly(struct sdhci_am654_data *sdhci_am654, + u32 itapdly) +{ + /* Set ITAPCHGWIN before writing to ITAPDLY */ + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPCHGWIN_MASK, + 1 << ITAPCHGWIN_SHIFT); + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPDLYSEL_MASK, + itapdly << ITAPDLYSEL_SHIFT); + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPCHGWIN_MASK, 0); +} + +static void sdhci_am654_setup_delay_chain(struct sdhci_am654_data *sdhci_am654, + unsigned char timing) +{ + u32 mask, val; + + regmap_update_bits(sdhci_am654->base, PHY_CTRL1, ENDLL_MASK, 0); + + val = 1 << SELDLYTXCLK_SHIFT | 1 << SELDLYRXCLK_SHIFT; + mask = SELDLYTXCLK_MASK | SELDLYRXCLK_MASK; + regmap_update_bits(sdhci_am654->base, PHY_CTRL5, mask, val); + + sdhci_am654_write_itapdly(sdhci_am654, + sdhci_am654->itap_del_sel[timing]); } static void sdhci_am654_set_clock(struct sdhci_host *host, unsigned int clock) @@ -202,11 +260,7 @@ static void sdhci_am654_set_clock(struct sdhci_host *host, unsigned int clock) u32 otap_del_ena; u32 mask, val; - if (sdhci_am654->dll_on) { - regmap_update_bits(sdhci_am654->base, PHY_CTRL1, ENDLL_MASK, 0); - - sdhci_am654->dll_on = false; - } + regmap_update_bits(sdhci_am654->base, PHY_CTRL1, ENDLL_MASK, 0); sdhci_set_clock(host, clock); @@ -234,14 +288,10 @@ static void sdhci_am654_set_clock(struct sdhci_host *host, unsigned int clock) regmap_update_bits(sdhci_am654->base, PHY_CTRL4, mask, val); - if (timing > MMC_TIMING_UHS_SDR25 && clock > CLOCK_TOO_SLOW_HZ) { - regmap_update_bits(sdhci_am654->base, PHY_CTRL5, - SELDLYTXCLK_MASK, 0); + if (timing > MMC_TIMING_UHS_SDR25 && clock >= CLOCK_TOO_SLOW_HZ) sdhci_am654_setup_dll(host, clock); - } else { - regmap_update_bits(sdhci_am654->base, PHY_CTRL5, - SELDLYTXCLK_MASK, 1 << SELDLYTXCLK_SHIFT); - } + else + sdhci_am654_setup_delay_chain(sdhci_am654, timing); regmap_update_bits(sdhci_am654->base, PHY_CTRL5, CLKBUFSEL_MASK, sdhci_am654->clkbuf_sel); @@ -469,7 +519,7 @@ static int sdhci_am654_get_otap_delay(struct sdhci_host *host, int i; int ret; - ret = device_property_read_u32(dev, td[MMC_TIMING_LEGACY].binding, + ret = device_property_read_u32(dev, td[MMC_TIMING_LEGACY].otap_binding, &sdhci_am654->otap_del_sel[MMC_TIMING_LEGACY]); if (ret) { /* @@ -492,11 +542,11 @@ static int sdhci_am654_get_otap_delay(struct sdhci_host *host, for (i = MMC_TIMING_MMC_HS; i <= MMC_TIMING_MMC_HS400; i++) { - ret = device_property_read_u32(dev, td[i].binding, + ret = device_property_read_u32(dev, td[i].otap_binding, &sdhci_am654->otap_del_sel[i]); if (ret) { dev_dbg(dev, "Couldn't find %s\n", - td[i].binding); + td[i].otap_binding); /* * Remove the corresponding capability * if an otap-del-sel value is not found @@ -506,6 +556,10 @@ static int sdhci_am654_get_otap_delay(struct sdhci_host *host, else host->mmc->caps2 &= ~td[i].capability; } + + if (td[i].itap_binding) + device_property_read_u32(dev, td[i].itap_binding, + &sdhci_am654->itap_del_sel[i]); } return 0; From 13ebeae68ac9830fb8e6e8dfc84f55dba5aab58b Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Wed, 23 Sep 2020 16:22:05 +0530 Subject: [PATCH 098/271] mmc: sdhci_am654: Add support for software tuning With the new SW tuning App note[1], a custom tuning algorithm is required for eMMC HS200, HS400 and SD card UHS modes. The algorithm involves running through the 32 possible input tap delay values and sending the appropriate tuning command (CMD19/21) for each of them to get a fail or pass result for each of the values. Typically, the range will have a small contiguous failing window. Considering the tuning range as a circular buffer, the algorithm then sets a final tuned value directly opposite to the failing window. [1] https://www.ti.com/lit/pdf/spract9 Signed-off-by: Faiz Abbas Reviewed-by: Kishon Vijay Abraham I Link: https://lore.kernel.org/r/20200923105206.7988-6-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 41 ++++++++++++++++++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 1213b711e60a..5af7638ad606 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -396,7 +396,46 @@ static u32 sdhci_am654_cqhci_irq(struct sdhci_host *host, u32 intmask) return 0; } +#define ITAP_MAX 32 +static int sdhci_am654_platform_execute_tuning(struct sdhci_host *host, + u32 opcode) +{ + struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host); + struct sdhci_am654_data *sdhci_am654 = sdhci_pltfm_priv(pltfm_host); + int cur_val, prev_val = 1, fail_len = 0, pass_window = 0, pass_len; + u32 itap; + + /* Enable ITAPDLY */ + regmap_update_bits(sdhci_am654->base, PHY_CTRL4, ITAPDLYENA_MASK, + 1 << ITAPDLYENA_SHIFT); + + for (itap = 0; itap < ITAP_MAX; itap++) { + sdhci_am654_write_itapdly(sdhci_am654, itap); + + cur_val = !mmc_send_tuning(host->mmc, opcode, NULL); + if (cur_val && !prev_val) + pass_window = itap; + + if (!cur_val) + fail_len++; + + prev_val = cur_val; + } + /* + * Having determined the length of the failing window and start of + * the passing window calculate the length of the passing window and + * set the final value halfway through it considering the range as a + * circular buffer + */ + pass_len = ITAP_MAX - fail_len; + itap = (pass_window + (pass_len >> 1)) % ITAP_MAX; + sdhci_am654_write_itapdly(sdhci_am654, itap); + + return 0; +} + static struct sdhci_ops sdhci_am654_ops = { + .platform_execute_tuning = sdhci_am654_platform_execute_tuning, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, .set_uhs_signaling = sdhci_set_uhs_signaling, @@ -426,6 +465,7 @@ static const struct sdhci_am654_driver_data sdhci_am654_drvdata = { }; static struct sdhci_ops sdhci_j721e_8bit_ops = { + .platform_execute_tuning = sdhci_am654_platform_execute_tuning, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, .set_uhs_signaling = sdhci_set_uhs_signaling, @@ -449,6 +489,7 @@ static const struct sdhci_am654_driver_data sdhci_j721e_8bit_drvdata = { }; static struct sdhci_ops sdhci_j721e_4bit_ops = { + .platform_execute_tuning = sdhci_am654_platform_execute_tuning, .get_max_clock = sdhci_pltfm_clk_get_max_clock, .get_timeout_clock = sdhci_pltfm_clk_get_max_clock, .set_uhs_signaling = sdhci_set_uhs_signaling, From 764384d0640376049d1a5c8789ce1c4564c76969 Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Wed, 23 Sep 2020 16:22:06 +0530 Subject: [PATCH 099/271] mmc: sdhci_am654: Enable tuning for SDR50 According to the SW tuning App note[1], tuning is required for all UHS speed modes. Tuning for SDR50 is not enabled in Capabilities by default so enable it from the CTL_CFG registers. [1] https://www.ti.com/lit/pdf/spract9 Signed-off-by: Faiz Abbas Link: https://lore.kernel.org/r/20200923105206.7988-7-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 5af7638ad606..2bce962bf7e4 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -19,9 +19,11 @@ /* CTL_CFG Registers */ #define CTL_CFG_2 0x14 +#define CTL_CFG_3 0x18 #define SLOTTYPE_MASK GENMASK(31, 30) #define SLOTTYPE_EMBEDDED BIT(30) +#define TUNINGFORSDR50_MASK BIT(13) /* PHY Registers */ #define PHY_CTRL1 0x100 @@ -646,6 +648,10 @@ static int sdhci_am654_init(struct sdhci_host *host) regmap_update_bits(sdhci_am654->base, CTL_CFG_2, SLOTTYPE_MASK, ctl_cfg_2); + /* Enable tuning for SDR50 */ + regmap_update_bits(sdhci_am654->base, CTL_CFG_3, TUNINGFORSDR50_MASK, + TUNINGFORSDR50_MASK); + ret = sdhci_setup_host(host); if (ret) return ret; From 0461e0db941f8f49dcfd0576c4449f2e5beda2f6 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 24 Sep 2020 10:31:22 -0400 Subject: [PATCH 100/271] fs: dlm: remove lock dependency warning During my experiments to make dlm robust against tcpkill application I was able to run sometimes in a circular lock dependency warning between clusters_root.subsys.su_mutex and con->sock_mutex. We don't need to held the sock_mutex when getting the mark value which held the clusters_root.subsys.su_mutex. This patch moves the specific handling just before the sock_mutex will be held. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 1bf1808bfa6b..24f5e55313d8 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -971,6 +971,10 @@ static void sctp_connect_to_sock(struct connection *con) return; } + result = dlm_comm_mark(con->nodeid, &mark); + if (result < 0) + return; + mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -995,11 +999,6 @@ static void sctp_connect_to_sock(struct connection *con) if (result < 0) goto socket_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto bind_err; - sock_set_mark(sock->sk, mark); con->rx_action = receive_from_sock; @@ -1072,6 +1071,10 @@ static void tcp_connect_to_sock(struct connection *con) return; } + result = dlm_comm_mark(con->nodeid, &mark); + if (result < 0) + return; + mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1086,11 +1089,6 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - /* set skb mark */ - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - goto out_err; - sock_set_mark(sock->sk, mark); memset(&saddr, 0, sizeof(saddr)); From 3f78cd7d2449a07904b3a23751758cbdeaaa20f3 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 24 Sep 2020 10:31:23 -0400 Subject: [PATCH 101/271] fs: dlm: fix mark per nodeid setting This patch fixes to set per nodeid mark configuration for accepted sockets as well. Before this patch only the listen socket mark value was used for all accepted connections. This patch will ensure that the cluster mark attribute value will be always used for all sockets, if a per nodeid mark value is specified dlm will use this value for the specific node. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 16 ++++++++++------ fs/dlm/config.h | 2 +- fs/dlm/lowcomms.c | 12 ++++++------ 3 files changed, 17 insertions(+), 13 deletions(-) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index f33a7e4ae917..ca4a9795afbe 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -860,18 +860,22 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -int dlm_comm_mark(int nodeid, unsigned int *mark) +void dlm_comm_mark(int nodeid, unsigned int *mark) { struct dlm_comm *cm; cm = get_comm(nodeid); - if (!cm) - return -ENOENT; + if (!cm) { + *mark = dlm_config.ci_mark; + return; + } + + if (cm->mark) + *mark = cm->mark; + else + *mark = dlm_config.ci_mark; - *mark = cm->mark; put_comm(cm); - - return 0; } int dlm_our_nodeid(void) diff --git a/fs/dlm/config.h b/fs/dlm/config.h index f62996cad561..3b284ae9aeeb 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -46,7 +46,7 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -int dlm_comm_mark(int nodeid, unsigned int *mark); +void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 24f5e55313d8..96f84541867c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -790,6 +790,7 @@ static int accept_from_sock(struct connection *con) int nodeid; struct connection *newcon; struct connection *addcon; + unsigned int mark; if (!dlm_allow_conn) { return -1; @@ -826,6 +827,9 @@ static int accept_from_sock(struct connection *con) return -1; } + dlm_comm_mark(nodeid, &mark); + sock_set_mark(newsock->sk, mark); + log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This @@ -971,9 +975,7 @@ static void sctp_connect_to_sock(struct connection *con) return; } - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - return; + dlm_comm_mark(con->nodeid, &mark); mutex_lock(&con->sock_mutex); @@ -1071,9 +1073,7 @@ static void tcp_connect_to_sock(struct connection *con) return; } - result = dlm_comm_mark(con->nodeid, &mark); - if (result < 0) - return; + dlm_comm_mark(con->nodeid, &mark); mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) From e1a0ec30a571f176e9b324daba4c0e3f200fe882 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 24 Sep 2020 10:31:24 -0400 Subject: [PATCH 102/271] fs: dlm: handle range check as callback This patch adds a callback to CLUSTER_ATTR macro to allow individual callbacks for attributes which might have a more complex attribute range checking just than non zero. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 41 +++++++++++++++++++++++------------------ 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index ca4a9795afbe..e03b409a4df0 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, int check_zero, + int *info_field, bool (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,7 +137,7 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_zero && !x) + if (check_cb && check_cb(x)) return -EINVAL; *cl_field = x; @@ -146,13 +146,13 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, return len; } -#define CLUSTER_ATTR(name, check_zero) \ +#define CLUSTER_ATTR(name, check_cb) \ static ssize_t cluster_##name##_store(struct config_item *item, \ const char *buf, size_t len) \ { \ struct dlm_cluster *cl = config_item_to_cluster(item); \ return cluster_set(cl, &cl->cl_##name, &dlm_config.ci_##name, \ - check_zero, buf, len); \ + check_cb, buf, len); \ } \ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ { \ @@ -161,20 +161,25 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); -CLUSTER_ATTR(tcp_port, 1); -CLUSTER_ATTR(buffer_size, 1); -CLUSTER_ATTR(rsbtbl_size, 1); -CLUSTER_ATTR(recover_timer, 1); -CLUSTER_ATTR(toss_secs, 1); -CLUSTER_ATTR(scan_secs, 1); -CLUSTER_ATTR(log_debug, 0); -CLUSTER_ATTR(log_info, 0); -CLUSTER_ATTR(protocol, 0); -CLUSTER_ATTR(mark, 0); -CLUSTER_ATTR(timewarn_cs, 1); -CLUSTER_ATTR(waitwarn_us, 0); -CLUSTER_ATTR(new_rsb_count, 0); -CLUSTER_ATTR(recover_callbacks, 0); +static bool dlm_check_zero(unsigned int x) +{ + return !x; +} + +CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(buffer_size, dlm_check_zero); +CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); +CLUSTER_ATTR(recover_timer, dlm_check_zero); +CLUSTER_ATTR(toss_secs, dlm_check_zero); +CLUSTER_ATTR(scan_secs, dlm_check_zero); +CLUSTER_ATTR(log_debug, NULL); +CLUSTER_ATTR(log_info, NULL); +CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(mark, NULL); +CLUSTER_ATTR(timewarn_cs, dlm_check_zero); +CLUSTER_ATTR(waitwarn_us, NULL); +CLUSTER_ATTR(new_rsb_count, NULL); +CLUSTER_ATTR(recover_callbacks, NULL); static struct configfs_attribute *cluster_attrs[] = { [CLUSTER_ATTR_TCP_PORT] = &cluster_attr_tcp_port, From 4e192ee68e5af301470a925b76700d788db35d96 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 24 Sep 2020 10:31:25 -0400 Subject: [PATCH 103/271] fs: dlm: disallow buffer size below default I observed that the upper layer will not send messages above this value. As conclusion the application receive buffer should not below that value, otherwise we are not capable to deliver the dlm message to the upper layer. This patch forbids to set the receive buffer below the maximum possible dlm message size. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index e03b409a4df0..a4bed304a843 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -166,8 +166,14 @@ static bool dlm_check_zero(unsigned int x) return !x; } +#define DEFAULT_BUFFER_SIZE 4096 +static bool dlm_check_buffer_size(unsigned int x) +{ + return (x < DEFAULT_BUFFER_SIZE); +} + CLUSTER_ATTR(tcp_port, dlm_check_zero); -CLUSTER_ATTR(buffer_size, dlm_check_zero); +CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); CLUSTER_ATTR(recover_timer, dlm_check_zero); CLUSTER_ATTR(toss_secs, dlm_check_zero); @@ -901,7 +907,6 @@ int dlm_our_addr(struct sockaddr_storage *addr, int num) /* Config file defaults */ #define DEFAULT_TCP_PORT 21064 -#define DEFAULT_BUFFER_SIZE 4096 #define DEFAULT_RSBTBL_SIZE 1024 #define DEFAULT_RECOVER_TIMER 5 #define DEFAULT_TOSS_SECS 10 From 4798cbbfbd00c498339bdcf4cc2429f53eb374ec Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Thu, 24 Sep 2020 10:31:26 -0400 Subject: [PATCH 104/271] fs: dlm: rework receive handling This patch reworks the current receive handling of dlm. As I tried to change the send handling to fix reorder issues I took a look into the receive handling and simplified it, it works as the following: Each connection has a preallocated receive buffer with a minimum length of 4096. On receive, the upper layer protocol will process all dlm message until there is not enough data anymore. If there exists "leftover" data at the end of the receive buffer because the dlm message wasn't fully received it will be copied to the begin of the preallocated receive buffer. Next receive more data will be appended to the previous "leftover" data and processing will begin again. This will remove a lot of code of the current mechanism. Inside the processing functionality we will ensure with a memmove() that the dlm message should be memory aligned. To have a dlm message always started at the beginning of the buffer will reduce some amount of memmove() calls because src and dest pointers are the same. The cluster attribute "buffer_size" becomes a new meaning, it's now the size of application layer receive buffer size. If this is changed during runtime the receive buffer will be reallocated. It's important that the receive buffer size has at minimum the size of the maximum possible dlm message size otherwise the received message cannot be placed inside the receive buffer size. Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/config.c | 1 - fs/dlm/config.h | 2 + fs/dlm/lowcomms.c | 179 ++++++++++++++++++++++------------------------ fs/dlm/midcomms.c | 132 +++++++++++++--------------------- fs/dlm/midcomms.h | 3 +- 5 files changed, 139 insertions(+), 178 deletions(-) diff --git a/fs/dlm/config.c b/fs/dlm/config.c index a4bed304a843..49c5f9407098 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -166,7 +166,6 @@ static bool dlm_check_zero(unsigned int x) return !x; } -#define DEFAULT_BUFFER_SIZE 4096 static bool dlm_check_buffer_size(unsigned int x) { return (x < DEFAULT_BUFFER_SIZE); diff --git a/fs/dlm/config.h b/fs/dlm/config.h index 3b284ae9aeeb..c210250a2581 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -12,6 +12,8 @@ #ifndef __CONFIG_DOT_H__ #define __CONFIG_DOT_H__ +#define DEFAULT_BUFFER_SIZE 4096 + struct dlm_config_node { int nodeid; int weight; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 96f84541867c..b7b7360be609 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -65,40 +65,6 @@ #define MAX_SEND_MSG_COUNT 25 #define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(10000) -struct cbuf { - unsigned int base; - unsigned int len; - unsigned int mask; -}; - -static void cbuf_add(struct cbuf *cb, int n) -{ - cb->len += n; -} - -static int cbuf_data(struct cbuf *cb) -{ - return ((cb->base + cb->len) & cb->mask); -} - -static void cbuf_init(struct cbuf *cb, int size) -{ - cb->base = cb->len = 0; - cb->mask = size-1; -} - -static void cbuf_eat(struct cbuf *cb, int n) -{ - cb->len -= n; - cb->base += n; - cb->base &= cb->mask; -} - -static bool cbuf_empty(struct cbuf *cb) -{ - return cb->len == 0; -} - struct connection { struct socket *sock; /* NULL if not connected */ uint32_t nodeid; /* So we know who we are in the list */ @@ -117,8 +83,6 @@ struct connection { int (*rx_action) (struct connection *); /* What to do when active */ void (*connect_action) (struct connection *); /* What to do to connect */ void (*shutdown_action)(struct connection *con); /* What to do to shutdown */ - struct page *rx_page; - struct cbuf cb; int retries; #define MAX_CONNECT_RETRIES 3 struct hlist_node list; @@ -126,6 +90,9 @@ struct connection { struct work_struct rwork; /* Receive workqueue */ struct work_struct swork; /* Send workqueue */ wait_queue_head_t shutdown_wait; /* wait for graceful shutdown */ + unsigned char *rx_buf; + int rx_buflen; + int rx_leftover; struct rcu_head rcu; }; #define sock2con(x) ((struct connection *)(x)->sk_user_data) @@ -219,6 +186,13 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) if (!con) return NULL; + con->rx_buflen = dlm_config.ci_buffer_size; + con->rx_buf = kmalloc(con->rx_buflen, GFP_NOFS); + if (!con->rx_buf) { + kfree(con); + return NULL; + } + con->nodeid = nodeid; mutex_init(&con->sock_mutex); INIT_LIST_HEAD(&con->writequeue); @@ -613,11 +587,8 @@ static void close_connection(struct connection *con, bool and_other, /* Will only re-enter once. */ close_connection(con->othercon, false, true, true); } - if (con->rx_page) { - __free_page(con->rx_page); - con->rx_page = NULL; - } + con->rx_leftover = 0; con->retries = 0; mutex_unlock(&con->sock_mutex); clear_bit(CF_CLOSING, &con->flags); @@ -671,16 +642,33 @@ static void dlm_tcp_shutdown(struct connection *con) shutdown_connection(con); } +static int con_realloc_receive_buf(struct connection *con, int newlen) +{ + unsigned char *newbuf; + + newbuf = kmalloc(newlen, GFP_NOFS); + if (!newbuf) + return -ENOMEM; + + /* copy any leftover from last receive */ + if (con->rx_leftover) + memmove(newbuf, con->rx_buf, con->rx_leftover); + + /* swap to new buffer space */ + kfree(con->rx_buf); + con->rx_buflen = newlen; + con->rx_buf = newbuf; + + return 0; +} + /* Data received from remote end */ static int receive_from_sock(struct connection *con) { - int ret = 0; - struct msghdr msg = {}; - struct kvec iov[2]; - unsigned len; - int r; int call_again_soon = 0; - int nvec; + struct msghdr msg; + struct kvec iov; + int ret, buflen; mutex_lock(&con->sock_mutex); @@ -688,71 +676,55 @@ static int receive_from_sock(struct connection *con) ret = -EAGAIN; goto out_close; } + if (con->nodeid == 0) { ret = -EINVAL; goto out_close; } - if (con->rx_page == NULL) { - /* - * This doesn't need to be atomic, but I think it should - * improve performance if it is. - */ - con->rx_page = alloc_page(GFP_ATOMIC); - if (con->rx_page == NULL) + /* realloc if we get new buffer size to read out */ + buflen = dlm_config.ci_buffer_size; + if (con->rx_buflen != buflen && con->rx_leftover <= buflen) { + ret = con_realloc_receive_buf(con, buflen); + if (ret < 0) goto out_resched; - cbuf_init(&con->cb, PAGE_SIZE); } - /* - * iov[0] is the bit of the circular buffer between the current end - * point (cb.base + cb.len) and the end of the buffer. + /* calculate new buffer parameter regarding last receive and + * possible leftover bytes */ - iov[0].iov_len = con->cb.base - cbuf_data(&con->cb); - iov[0].iov_base = page_address(con->rx_page) + cbuf_data(&con->cb); - iov[1].iov_len = 0; - nvec = 1; + iov.iov_base = con->rx_buf + con->rx_leftover; + iov.iov_len = con->rx_buflen - con->rx_leftover; - /* - * iov[1] is the bit of the circular buffer between the start of the - * buffer and the start of the currently used section (cb.base) - */ - if (cbuf_data(&con->cb) >= con->cb.base) { - iov[0].iov_len = PAGE_SIZE - cbuf_data(&con->cb); - iov[1].iov_len = con->cb.base; - iov[1].iov_base = page_address(con->rx_page); - nvec = 2; - } - len = iov[0].iov_len + iov[1].iov_len; - iov_iter_kvec(&msg.msg_iter, READ, iov, nvec, len); - - r = ret = sock_recvmsg(con->sock, &msg, MSG_DONTWAIT | MSG_NOSIGNAL); + memset(&msg, 0, sizeof(msg)); + msg.msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL; + ret = kernel_recvmsg(con->sock, &msg, &iov, 1, iov.iov_len, + msg.msg_flags); if (ret <= 0) goto out_close; - else if (ret == len) + else if (ret == iov.iov_len) call_again_soon = 1; - cbuf_add(&con->cb, ret); - ret = dlm_process_incoming_buffer(con->nodeid, - page_address(con->rx_page), - con->cb.base, con->cb.len, - PAGE_SIZE); - if (ret < 0) { - log_print("lowcomms err %d: addr=%p, base=%u, len=%u, read=%d", - ret, page_address(con->rx_page), con->cb.base, - con->cb.len, r); - cbuf_eat(&con->cb, r); - } else { - cbuf_eat(&con->cb, ret); - } + /* new buflen according readed bytes and leftover from last receive */ + buflen = ret + con->rx_leftover; + ret = dlm_process_incoming_buffer(con->nodeid, con->rx_buf, buflen); + if (ret < 0) + goto out_close; - if (cbuf_empty(&con->cb) && !call_again_soon) { - __free_page(con->rx_page); - con->rx_page = NULL; + /* calculate leftover bytes from process and put it into begin of + * the receive buffer, so next receive we have the full message + * at the start address of the receive buffer. + */ + con->rx_leftover = buflen - ret; + if (con->rx_leftover) { + memmove(con->rx_buf, con->rx_buf + ret, + con->rx_leftover); + call_again_soon = true; } if (call_again_soon) goto out_resched; + mutex_unlock(&con->sock_mutex); return 0; @@ -854,6 +826,17 @@ static int accept_from_sock(struct connection *con) result = -ENOMEM; goto accept_err; } + + othercon->rx_buflen = dlm_config.ci_buffer_size; + othercon->rx_buf = kmalloc(othercon->rx_buflen, GFP_NOFS); + if (!othercon->rx_buf) { + mutex_unlock(&newcon->sock_mutex); + kfree(othercon); + log_print("failed to allocate incoming socket receive buffer"); + result = -ENOMEM; + goto accept_err; + } + othercon->nodeid = nodeid; othercon->rx_action = receive_from_sock; mutex_init(&othercon->sock_mutex); @@ -1603,6 +1586,14 @@ static void shutdown_conn(struct connection *con) con->shutdown_action(con); } +static void connection_release(struct rcu_head *rcu) +{ + struct connection *con = container_of(rcu, struct connection, rcu); + + kfree(con->rx_buf); + kfree(con); +} + static void free_conn(struct connection *con) { close_connection(con, true, true, true); @@ -1611,10 +1602,10 @@ static void free_conn(struct connection *con) spin_unlock(&connections_lock); if (con->othercon) { clean_one_writequeue(con->othercon); - kfree_rcu(con->othercon, rcu); + call_rcu(&con->othercon->rcu, connection_release); } clean_one_writequeue(con); - kfree_rcu(con, rcu); + call_rcu(&con->rcu, connection_release); } static void work_flush(void) diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index 921322d133e3..fde3a6afe4be 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,114 +22,84 @@ * into packets and sends them to the comms layer. */ +#include + #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" #include "lock.h" #include "midcomms.h" - -static void copy_from_cb(void *dst, const void *base, unsigned offset, - unsigned len, unsigned limit) -{ - unsigned copy = len; - - if ((copy + offset) > limit) - copy = limit - offset; - memcpy(dst, base + offset, copy); - len -= copy; - if (len) - memcpy(dst + copy, base, len); -} - /* * Called from the low-level comms layer to process a buffer of * commands. - * - * Only complete messages are processed here, any "spare" bytes from - * the end of a buffer are saved and tacked onto the front of the next - * message that comes in. I doubt this will happen very often but we - * need to be able to cope with it and I don't want the task to be waiting - * for packets to come in when there is useful work to be done. */ -int dlm_process_incoming_buffer(int nodeid, const void *base, - unsigned offset, unsigned len, unsigned limit) +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) { - union { - unsigned char __buf[DLM_INBUF_LEN]; - /* this is to force proper alignment on some arches */ - union dlm_packet p; - } __tmp; - union dlm_packet *p = &__tmp.p; - int ret = 0; - int err = 0; + const unsigned char *ptr = buf; + const struct dlm_header *hd; uint16_t msglen; - uint32_t lockspace; + int ret = 0; - while (len > sizeof(struct dlm_header)) { + while (len >= sizeof(struct dlm_header)) { + hd = (struct dlm_header *)ptr; - /* Copy just the header to check the total length. The - message may wrap around the end of the buffer back to the - start, so we need to use a temp buffer and copy_from_cb. */ - - copy_from_cb(p, base, offset, sizeof(struct dlm_header), - limit); - - msglen = le16_to_cpu(p->header.h_length); - lockspace = p->header.h_lockspace; - - err = -EINVAL; - if (msglen < sizeof(struct dlm_header)) - break; - if (p->header.h_cmd == DLM_MSG) { - if (msglen < sizeof(struct dlm_message)) - break; - } else { - if (msglen < sizeof(struct dlm_rcom)) - break; + /* no message should be more than this otherwise we + * cannot deliver this message to upper layers + */ + msglen = get_unaligned_le16(&hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE) { + log_print("received invalid length header: %u, will abort message parsing", + msglen); + return -EBADMSG; } - err = -E2BIG; - if (msglen > dlm_config.ci_buffer_size) { - log_print("message size %d from %d too big, buf len %d", - msglen, nodeid, len); - break; - } - err = 0; - - /* If only part of the full message is contained in this - buffer, then do nothing and wait for lowcomms to call - us again later with more data. We return 0 meaning - we've consumed none of the input buffer. */ + /* caller will take care that leftover + * will be parsed next call with more data + */ if (msglen > len) break; - /* Allocate a larger temp buffer if the full message won't fit - in the buffer on the stack (which should work for most - ordinary messages). */ + switch (hd->h_cmd) { + case DLM_MSG: + if (msglen < sizeof(struct dlm_message)) { + log_print("dlm msg too small: %u, will skip this message", + msglen); + goto skip; + } - if (msglen > sizeof(__tmp) && p == &__tmp.p) { - p = kmalloc(dlm_config.ci_buffer_size, GFP_NOFS); - if (p == NULL) - return ret; + break; + case DLM_RCOM: + if (msglen < sizeof(struct dlm_rcom)) { + log_print("dlm rcom msg too small: %u, will skip this message", + msglen); + goto skip; + } + + break; + default: + log_print("unsupported h_cmd received: %u, will skip this message", + hd->h_cmd); + goto skip; } - copy_from_cb(p, base, offset, msglen, limit); - - BUG_ON(lockspace != p->header.h_lockspace); + /* for aligned memory access, we just copy current message + * to begin of the buffer which contains already parsed buffer + * data and should provide align access for upper layers + * because the start address of the buffer has a aligned + * address. This memmove can be removed when the upperlayer + * is capable of unaligned memory access. + */ + memmove(buf, ptr, msglen); + dlm_receive_buffer((union dlm_packet *)buf, nodeid); +skip: ret += msglen; - offset += msglen; - offset &= (limit - 1); len -= msglen; - - dlm_receive_buffer(p, nodeid); + ptr += msglen; } - if (p != &__tmp.p) - kfree(p); - - return err ? err : ret; + return ret; } diff --git a/fs/dlm/midcomms.h b/fs/dlm/midcomms.h index 2e122e81c8d0..61e90a921849 100644 --- a/fs/dlm/midcomms.h +++ b/fs/dlm/midcomms.h @@ -12,8 +12,7 @@ #ifndef __MIDCOMMS_DOT_H__ #define __MIDCOMMS_DOT_H__ -int dlm_process_incoming_buffer(int nodeid, const void *base, unsigned offset, - unsigned len, unsigned limit); +int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int buflen); #endif /* __MIDCOMMS_DOT_H__ */ From 4f2b30fd9b4bd6e3620fe55786df7fc5f89ad526 Mon Sep 17 00:00:00 2001 From: Alexander Aring Date: Wed, 30 Sep 2020 18:37:29 -0400 Subject: [PATCH 105/271] fs: dlm: fix race in nodeid2con This patch fixes a race in nodeid2con in cases that we parallel running a lookup and both will create a connection structure for the same nodeid. It's a rare case to create a new connection structure to keep reader lockless we just do a lookup inside the protection area again and drop previous work if this race happens. Fixes: a47666eb763cc ("fs: dlm: make connection hash lockless") Signed-off-by: Alexander Aring Signed-off-by: David Teigland --- fs/dlm/lowcomms.c | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index b7b7360be609..79f56f16bc2c 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -175,7 +175,7 @@ static struct connection *__find_con(int nodeid) */ static struct connection *nodeid2con(int nodeid, gfp_t alloc) { - struct connection *con = NULL; + struct connection *con, *tmp; int r; con = __find_con(nodeid); @@ -213,6 +213,20 @@ static struct connection *nodeid2con(int nodeid, gfp_t alloc) r = nodeid_hash(nodeid); spin_lock(&connections_lock); + /* Because multiple workqueues/threads calls this function it can + * race on multiple cpu's. Instead of locking hot path __find_con() + * we just check in rare cases of recently added nodes again + * under protection of connections_lock. If this is the case we + * abort our connection creation and return the existing connection. + */ + tmp = __find_con(nodeid); + if (tmp) { + spin_unlock(&connections_lock); + kfree(con->rx_buf); + kfree(con); + return tmp; + } + hlist_add_head_rcu(&con->list, &connection_hash[r]); spin_unlock(&connections_lock); From f23cc3ba491af77395cea3f9d51204398729f26b Mon Sep 17 00:00:00 2001 From: Raul E Rangel Date: Mon, 28 Sep 2020 15:59:20 -0600 Subject: [PATCH 106/271] mmc: sdhci-acpi: AMDI0040: Set SDHCI_QUIRK2_PRESET_VALUE_BROKEN This change fixes HS400 tuning for devices with invalid presets. SDHCI presets are not currently used for eMMC HS/HS200/HS400, but are used for DDR52. The HS400 retuning sequence is: HS400->DDR52->HS->HS200->Perform Tuning->HS->HS400 This means that when HS400 tuning happens, we transition through DDR52 for a very brief period. This causes presets to be enabled unintentionally and stay enabled when transitioning back to HS200 or HS400. Some firmware has invalid presets, so we end up with driver strengths that can cause I/O problems. Fixes: 34597a3f60b1 ("mmc: sdhci-acpi: Add support for ACPI HID of AMD Controller with HS400") Signed-off-by: Raul E Rangel Acked-by: Adrian Hunter Cc: stable@vger.kernel.org Link: https://lore.kernel.org/r/20200928154718.1.Icc21d4b2f354e83e26e57e270dc952f5fe0b0a40@changeid Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-acpi.c | 37 +++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/drivers/mmc/host/sdhci-acpi.c b/drivers/mmc/host/sdhci-acpi.c index fc41d0451f20..54205e3be9e8 100644 --- a/drivers/mmc/host/sdhci-acpi.c +++ b/drivers/mmc/host/sdhci-acpi.c @@ -662,6 +662,43 @@ static int sdhci_acpi_emmc_amd_probe_slot(struct platform_device *pdev, (host->mmc->caps & MMC_CAP_1_8V_DDR)) host->mmc->caps2 = MMC_CAP2_HS400_1_8V; + /* + * There are two types of presets out in the wild: + * 1) Default/broken presets. + * These presets have two sets of problems: + * a) The clock divisor for SDR12, SDR25, and SDR50 is too small. + * This results in clock frequencies that are 2x higher than + * acceptable. i.e., SDR12 = 25 MHz, SDR25 = 50 MHz, SDR50 = + * 100 MHz.x + * b) The HS200 and HS400 driver strengths don't match. + * By default, the SDR104 preset register has a driver strength of + * A, but the (internal) HS400 preset register has a driver + * strength of B. As part of initializing HS400, HS200 tuning + * needs to be performed. Having different driver strengths + * between tuning and operation is wrong. It results in different + * rise/fall times that lead to incorrect sampling. + * 2) Firmware with properly initialized presets. + * These presets have proper clock divisors. i.e., SDR12 => 12MHz, + * SDR25 => 25 MHz, SDR50 => 50 MHz. Additionally the HS200 and + * HS400 preset driver strengths match. + * + * Enabling presets for HS400 doesn't work for the following reasons: + * 1) sdhci_set_ios has a hard coded list of timings that are used + * to determine if presets should be enabled. + * 2) sdhci_get_preset_value is using a non-standard register to + * read out HS400 presets. The AMD controller doesn't support this + * non-standard register. In fact, it doesn't expose the HS400 + * preset register anywhere in the SDHCI memory map. This results + * in reading a garbage value and using the wrong presets. + * + * Since HS400 and HS200 presets must be identical, we could + * instead use the the SDR104 preset register. + * + * If the above issues are resolved we could remove this quirk for + * firmware that that has valid presets (i.e., SDR12 <= 12 MHz). + */ + host->quirks2 |= SDHCI_QUIRK2_PRESET_VALUE_BROKEN; + host->mmc_host_ops.select_drive_strength = amd_select_drive_strength; host->mmc_host_ops.set_ios = amd_set_ios; host->mmc_host_ops.execute_tuning = amd_sdhci_execute_tuning; From 347f6be11de1940dc915596d17b0f94b03779c77 Mon Sep 17 00:00:00 2001 From: Ben Chuang Date: Mon, 5 Oct 2020 18:55:09 +0800 Subject: [PATCH 107/271] mmc: sdhci-pci-gli: Add CQHCI Support for GL9763E Add CQHCI initialization and implement CQHCI operations for GL9763E. Use bit19 of the register (0x888) to decide whether to disable command queuing. If the bit is set, the command queuing will be disabled. Signed-off-by: Ben Chuang Link: https://lore.kernel.org/r/20201005105509.11343-1-ben.chuanggli@gmail.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci-pci-gli.c | 150 ++++++++++++++++++++++++++++++- 1 file changed, 148 insertions(+), 2 deletions(-) diff --git a/drivers/mmc/host/sdhci-pci-gli.c b/drivers/mmc/host/sdhci-pci-gli.c index 5da2b06d84ae..9887485a4134 100644 --- a/drivers/mmc/host/sdhci-pci-gli.c +++ b/drivers/mmc/host/sdhci-pci-gli.c @@ -14,6 +14,7 @@ #include #include "sdhci.h" #include "sdhci-pci.h" +#include "cqhci.h" /* Genesys Logic extra registers */ #define SDHCI_GLI_9750_WT 0x800 @@ -81,9 +82,16 @@ #define GLI_9763E_VHS_REV_R 0x0 #define GLI_9763E_VHS_REV_M 0x1 #define GLI_9763E_VHS_REV_W 0x2 +#define PCIE_GLI_9763E_MB 0x888 +#define GLI_9763E_MB_CMDQ_OFF BIT(19) #define PCIE_GLI_9763E_SCR 0x8E0 #define GLI_9763E_SCR_AXI_REQ BIT(9) +#define SDHCI_GLI_9763E_CQE_BASE_ADDR 0x200 +#define GLI_9763E_CQE_TRNS_MODE (SDHCI_TRNS_MULTI | \ + SDHCI_TRNS_BLK_CNT_EN | \ + SDHCI_TRNS_DMA) + #define PCI_GLI_9755_WT 0x800 #define PCI_GLI_9755_WT_EN BIT(0) #define GLI_9755_WT_EN_ON 0x1 @@ -578,6 +586,30 @@ static int sdhci_pci_gli_resume(struct sdhci_pci_chip *chip) return sdhci_pci_resume_host(chip); } + +static int sdhci_cqhci_gli_resume(struct sdhci_pci_chip *chip) +{ + struct sdhci_pci_slot *slot = chip->slots[0]; + int ret; + + ret = sdhci_pci_gli_resume(chip); + if (ret) + return ret; + + return cqhci_resume(slot->host->mmc); +} + +static int sdhci_cqhci_gli_suspend(struct sdhci_pci_chip *chip) +{ + struct sdhci_pci_slot *slot = chip->slots[0]; + int ret; + + ret = cqhci_suspend(slot->host->mmc); + if (ret) + return ret; + + return sdhci_suspend_host(slot->host); +} #endif static void gl9763e_hs400_enhanced_strobe(struct mmc_host *mmc, @@ -614,6 +646,110 @@ static void sdhci_set_gl9763e_signaling(struct sdhci_host *host, sdhci_writew(host, ctrl_2, SDHCI_HOST_CONTROL2); } +static void sdhci_gl9763e_dumpregs(struct mmc_host *mmc) +{ + sdhci_dumpregs(mmc_priv(mmc)); +} + +static void sdhci_gl9763e_cqe_pre_enable(struct mmc_host *mmc) +{ + struct cqhci_host *cq_host = mmc->cqe_private; + u32 value; + + value = cqhci_readl(cq_host, CQHCI_CFG); + value |= CQHCI_ENABLE; + cqhci_writel(cq_host, value, CQHCI_CFG); +} + +static void sdhci_gl9763e_cqe_enable(struct mmc_host *mmc) +{ + struct sdhci_host *host = mmc_priv(mmc); + + sdhci_writew(host, GLI_9763E_CQE_TRNS_MODE, SDHCI_TRANSFER_MODE); + sdhci_cqe_enable(mmc); +} + +static u32 sdhci_gl9763e_cqhci_irq(struct sdhci_host *host, u32 intmask) +{ + int cmd_error = 0; + int data_error = 0; + + if (!sdhci_cqe_irq(host, intmask, &cmd_error, &data_error)) + return intmask; + + cqhci_irq(host->mmc, intmask, cmd_error, data_error); + + return 0; +} + +static void sdhci_gl9763e_cqe_post_disable(struct mmc_host *mmc) +{ + struct sdhci_host *host = mmc_priv(mmc); + struct cqhci_host *cq_host = mmc->cqe_private; + u32 value; + + value = cqhci_readl(cq_host, CQHCI_CFG); + value &= ~CQHCI_ENABLE; + cqhci_writel(cq_host, value, CQHCI_CFG); + sdhci_writew(host, 0x0, SDHCI_TRANSFER_MODE); +} + +static const struct cqhci_host_ops sdhci_gl9763e_cqhci_ops = { + .enable = sdhci_gl9763e_cqe_enable, + .disable = sdhci_cqe_disable, + .dumpregs = sdhci_gl9763e_dumpregs, + .pre_enable = sdhci_gl9763e_cqe_pre_enable, + .post_disable = sdhci_gl9763e_cqe_post_disable, +}; + +static int gl9763e_add_host(struct sdhci_pci_slot *slot) +{ + struct device *dev = &slot->chip->pdev->dev; + struct sdhci_host *host = slot->host; + struct cqhci_host *cq_host; + bool dma64; + int ret; + + ret = sdhci_setup_host(host); + if (ret) + return ret; + + cq_host = devm_kzalloc(dev, sizeof(*cq_host), GFP_KERNEL); + if (!cq_host) { + ret = -ENOMEM; + goto cleanup; + } + + cq_host->mmio = host->ioaddr + SDHCI_GLI_9763E_CQE_BASE_ADDR; + cq_host->ops = &sdhci_gl9763e_cqhci_ops; + + dma64 = host->flags & SDHCI_USE_64_BIT_DMA; + if (dma64) + cq_host->caps |= CQHCI_TASK_DESC_SZ_128; + + ret = cqhci_init(cq_host, host->mmc, dma64); + if (ret) + goto cleanup; + + ret = __sdhci_add_host(host); + if (ret) + goto cleanup; + + return 0; + +cleanup: + sdhci_cleanup_host(host); + return ret; +} + +static void sdhci_gl9763e_reset(struct sdhci_host *host, u8 mask) +{ + if ((host->mmc->caps2 & MMC_CAP2_CQE) && (mask & SDHCI_RESET_ALL) && + host->mmc->cqe_private) + cqhci_deactivate(host->mmc); + sdhci_reset(host, mask); +} + static void gli_set_gl9763e(struct sdhci_pci_slot *slot) { struct pci_dev *pdev = slot->chip->pdev; @@ -636,7 +772,9 @@ static void gli_set_gl9763e(struct sdhci_pci_slot *slot) static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot) { + struct pci_dev *pdev = slot->chip->pdev; struct sdhci_host *host = slot->host; + u32 value; host->mmc->caps |= MMC_CAP_8_BIT_DATA | MMC_CAP_1_8V_DDR | @@ -646,6 +784,11 @@ static int gli_probe_slot_gl9763e(struct sdhci_pci_slot *slot) MMC_CAP2_HS400_ES | MMC_CAP2_NO_SDIO | MMC_CAP2_NO_SD; + + pci_read_config_dword(pdev, PCIE_GLI_9763E_MB, &value); + if (!(value & GLI_9763E_MB_CMDQ_OFF)) + host->mmc->caps2 |= MMC_CAP2_CQE | MMC_CAP2_CQE_DCMD; + gli_pcie_enable_msi(slot); host->mmc_host_ops.hs400_enhanced_strobe = gl9763e_hs400_enhanced_strobe; @@ -699,9 +842,10 @@ static const struct sdhci_ops sdhci_gl9763e_ops = { .set_clock = sdhci_set_clock, .enable_dma = sdhci_pci_enable_dma, .set_bus_width = sdhci_set_bus_width, - .reset = sdhci_reset, + .reset = sdhci_gl9763e_reset, .set_uhs_signaling = sdhci_set_gl9763e_signaling, .voltage_switch = sdhci_gli_voltage_switch, + .irq = sdhci_gl9763e_cqhci_irq, }; const struct sdhci_pci_fixes sdhci_gl9763e = { @@ -709,6 +853,8 @@ const struct sdhci_pci_fixes sdhci_gl9763e = { .probe_slot = gli_probe_slot_gl9763e, .ops = &sdhci_gl9763e_ops, #ifdef CONFIG_PM_SLEEP - .resume = sdhci_pci_gli_resume, + .resume = sdhci_cqhci_gli_resume, + .suspend = sdhci_cqhci_gli_suspend, #endif + .add_host = gl9763e_add_host, }; From 437490fed3b0c9ae21af8f70e0f338d34560842b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 28 Jul 2020 09:42:49 +0800 Subject: [PATCH 108/271] btrfs: tracepoints: output proper root owner for trace_find_free_extent() The current trace event always output result like this: find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA) find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=4(METADATA) find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=2(EXTENT_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA) find_free_extent: root=2(EXTENT_TREE) len=4096 empty_size=0 flags=1(DATA) T's saying we're allocating data extent for EXTENT tree, which is not even possible. It's because we always use EXTENT tree as the owner for trace_find_free_extent() without using the @root from btrfs_reserve_extent(). This patch will change the parameter to use proper @root for trace_find_free_extent(): Now it looks much better: find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=4096 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=8192 empty_size=0 flags=1(DATA) find_free_extent: root=5(FS_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=7(CSUM_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=2(EXTENT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) find_free_extent: root=1(ROOT_TREE) len=16384 empty_size=0 flags=36(METADATA|DUP) Reported-by: Hans van Kranenburg CC: stable@vger.kernel.org # 5.4+ Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 ++++--- include/trace/events/btrfs.h | 10 ++++++---- 2 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 780b9c9a98fe..dbff61d36cab 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -3918,11 +3918,12 @@ static int prepare_allocation(struct btrfs_fs_info *fs_info, * |- Push harder to find free extents * |- If not found, re-iterate all block groups */ -static noinline int find_free_extent(struct btrfs_fs_info *fs_info, +static noinline int find_free_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes, u64 empty_size, u64 hint_byte_orig, struct btrfs_key *ins, u64 flags, int delalloc) { + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; int cache_block_group_error = 0; struct btrfs_block_group *block_group = NULL; @@ -3954,7 +3955,7 @@ static noinline int find_free_extent(struct btrfs_fs_info *fs_info, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(fs_info, num_bytes, empty_size, flags); + trace_find_free_extent(root, num_bytes, empty_size, flags); space_info = btrfs_find_space_info(fs_info, flags); if (!space_info) { @@ -4203,7 +4204,7 @@ int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, flags = get_alloc_profile_by_root(root, is_data); again: WARN_ON(num_bytes < fs_info->sectorsize); - ret = find_free_extent(fs_info, ram_bytes, num_bytes, empty_size, + ret = find_free_extent(root, ram_bytes, num_bytes, empty_size, hint_byte, ins, flags, delalloc); if (!ret && !is_data) { btrfs_dec_block_group_reservations(fs_info, ins->objectid); diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 863335ecb7e8..b9241836d4f7 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -1176,25 +1176,27 @@ DEFINE_EVENT(btrfs__reserved_extent, btrfs_reserved_extent_free, TRACE_EVENT(find_free_extent, - TP_PROTO(const struct btrfs_fs_info *fs_info, u64 num_bytes, + TP_PROTO(const struct btrfs_root *root, u64 num_bytes, u64 empty_size, u64 data), - TP_ARGS(fs_info, num_bytes, empty_size, data), + TP_ARGS(root, num_bytes, empty_size, data), TP_STRUCT__entry_btrfs( + __field( u64, root_objectid ) __field( u64, num_bytes ) __field( u64, empty_size ) __field( u64, data ) ), - TP_fast_assign_btrfs(fs_info, + TP_fast_assign_btrfs(root->fs_info, + __entry->root_objectid = root->root_key.objectid; __entry->num_bytes = num_bytes; __entry->empty_size = empty_size; __entry->data = data; ), TP_printk_btrfs("root=%llu(%s) len=%llu empty_size=%llu flags=%llu(%s)", - show_root_type(BTRFS_EXTENT_TREE_OBJECTID), + show_root_type(__entry->root_objectid), __entry->num_bytes, __entry->empty_size, __entry->data, __print_flags((unsigned long)__entry->data, "|", BTRFS_GROUP_FLAGS)) From 260db43cd2f556677f6ae818ba09f997eed81004 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Tue, 4 Aug 2020 19:48:34 -0700 Subject: [PATCH 109/271] btrfs: delete duplicated words + other fixes in comments Delete repeated words in fs/btrfs/. {to, the, a, and old} and change "into 2 part" to "into 2 parts". Reviewed-by: Nikolay Borisov Signed-off-by: Randy Dunlap Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 +- fs/btrfs/ctree.c | 2 +- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent_io.c | 2 +- fs/btrfs/free-space-cache.c | 2 +- fs/btrfs/qgroup.c | 2 +- fs/btrfs/tree-log.c | 4 ++-- 7 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index ea8aaf36647e..fb507a389f40 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2785,7 +2785,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) * finished yet (no block group item in the extent tree * yet, etc). If this is the case, wait for all free * space endio workers to finish and retry. This is a - * a very rare case so no need for a more efficient and + * very rare case so no need for a more efficient and * complex approach. */ if (ret == -ENOENT) { diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index cd392da69b81..e0fd8a34efcc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -5115,7 +5115,7 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, slot--; /* * check this node pointer against the min_trans parameters. - * If it is too old, old, skip to the next one. + * If it is too old, skip to the next one. */ while (slot < nritems) { u64 gen; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 9f72b092bc22..b1f6662f89c4 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2928,7 +2928,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device } /* - * Verify the type first, if that or the the checksum value are + * Verify the type first, if that or the checksum value are * corrupted, we'll find out */ csum_type = btrfs_super_csum_type(disk_super); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a940edb1e64f..0320ddb0133e 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3241,7 +3241,7 @@ static int __do_readpage(struct page *page, /* * If we have a file range that points to a compressed extent - * and it's followed by a consecutive file range that points to + * and it's followed by a consecutive file range that points * to the same compressed extent (possibly with a different * offset and/or length, so it either points to the whole extent * or only part of it), we must make sure we do not submit a diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index dc82fd0c80cb..8759f5a1d6a0 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -1353,7 +1353,7 @@ static int __btrfs_write_out_cache(struct btrfs_root *root, struct inode *inode, /* * at this point the pages are under IO and we're happy, - * The caller is responsible for waiting on them and updating the + * The caller is responsible for waiting on them and updating * the cache and the inode */ io_ctl->entries = entries; diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index c0f350c3a0cf..580899bdb991 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -2315,7 +2315,7 @@ static int qgroup_update_refcnt(struct btrfs_fs_info *fs_info, * Update qgroup rfer/excl counters. * Rfer update is easy, codes can explain themselves. * - * Excl update is tricky, the update is split into 2 part. + * Excl update is tricky, the update is split into 2 parts. * Part 1: Possible exclusive <-> sharing detect: * | A | !A | * ------------------------------------- diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 39da9db35278..a6e69378b706 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4883,7 +4883,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * Check the inode's logged_trans only instead of * btrfs_inode_in_log(). This is because the last_log_commit of * the inode is not updated when we only log that it exists and - * and it has the full sync bit set (see btrfs_log_inode()). + * it has the full sync bit set (see btrfs_log_inode()). */ if (BTRFS_I(inode)->logged_trans == trans->transid) { spin_unlock(&BTRFS_I(inode)->lock); @@ -6380,7 +6380,7 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT * otherwise. * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * to sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, + * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be * committed (without attempting to sync the log). */ From 57297c1e8e1ce030050bf07ffa7182aaa588e2ac Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Aug 2020 12:43:18 +0300 Subject: [PATCH 110/271] btrfs: remove spurious BUG_ON in btrfs_get_extent That BUG_ON cannot ever trigger because as the comment there states - 'err' is always set. Simply remove it as it brings no value. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 9570458aa847..df08ace5d914 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6748,7 +6748,6 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, free_extent_map(em); return ERR_PTR(err); } - BUG_ON(!em); /* Error is always set */ return em; } From 8e5600818022bd329a5846dd6b5242965644f150 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 16:18:51 +0300 Subject: [PATCH 111/271] btrfs: remove fsid argument from btrfs_sysfs_update_sprout_fsid It can be accessed from 'fs_devices' as it's identical to fs_info->fs_devices. Also add a comment about why we are calling the function. No semantic changes. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 6 +++--- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 7 +++++-- 3 files changed, 9 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 5be30066563c..d7c58c89fb9c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1328,8 +1328,8 @@ void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) &disk_to_dev(bdev->bd_disk)->kobj); } -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid) +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices) + { char fsid_buf[BTRFS_UUID_UNPARSED_SIZE]; @@ -1337,7 +1337,7 @@ void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, * Sprouting changes fsid of the mounted filesystem, rename the fsid * directory */ - snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fsid); + snprintf(fsid_buf, BTRFS_UUID_UNPARSED_SIZE, "%pU", fs_devices->fsid); if (kobject_rename(&fs_devices->fsid_kobj, fsid_buf)) btrfs_warn(fs_devices->fs_info, "sysfs: failed to create fsid for sprout"); diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index cf839c46a131..c9efa15f96e0 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -20,8 +20,7 @@ int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); -void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices, - const u8 *fsid); +void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, u64 bit, enum btrfs_feature_set set); void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 1997a7d67f22..5d73c287314a 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2645,8 +2645,11 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path goto error_sysfs; } - btrfs_sysfs_update_sprout_fsid(fs_devices, - fs_info->fs_devices->fsid); + /* + * fs_devices now represents the newly sprouted filesystem and + * its fsid has been changed by btrfs_prepare_sprout + */ + btrfs_sysfs_update_sprout_fsid(fs_devices); } ret = btrfs_commit_transaction(trans); From b49121393f583635b78abae4037fa1e772b2a042 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:12 -0400 Subject: [PATCH 112/271] btrfs: change nr to u64 in btrfs_start_delalloc_roots We have btrfs_wait_ordered_roots() which takes a u64 for nr, but btrfs_start_delalloc_roots() that takes an int for nr, which makes using them in conjunction, especially for something like (u64)-1, annoying and inconsistent. Fix btrfs_start_delalloc_roots() to take a u64 for nr and adjust start_delalloc_inodes() and it's callers appropriately. This means we've adjusted start_delalloc_inodes() to take a pointer of nr since we want to preserve the ability for start-delalloc_inodes() to return an error, so simply make it do the nr adjusting as necessary. Part of adjusting the callers to this means changing btrfs_writeback_inodes_sb_nr() to take a u64 for items. This may be confusing because it seems unrelated, but the caller of btrfs_writeback_inodes_sb_nr() already passes in a u64, it's just the function variable that needs to be changed. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/inode.c | 26 ++++++++++---------------- fs/btrfs/ioctl.c | 2 +- fs/btrfs/space-info.c | 2 +- 5 files changed, 14 insertions(+), 20 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9a72896bed2e..e2ec4f067857 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2956,7 +2956,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, u32 min_type); int btrfs_start_delalloc_snapshot(struct btrfs_root *root); -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr); +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, unsigned int extra_bits, struct extent_state **cached_state); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index e4a1c6afe35d..61d800a149b0 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -661,7 +661,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, * flush all outstanding I/O and inode extent mappings before the * copy operation is declared as being finished */ - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) { mutex_unlock(&dev_replace->lock_finishing_cancel_unmount); return ret; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index df08ace5d914..48b8b5f09f47 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9387,7 +9387,7 @@ static struct btrfs_delalloc_work *btrfs_alloc_delalloc_work(struct inode *inode * some fairly slow code that needs optimization. This walks the list * of all the inodes with pending delalloc and forces them to disk. */ -static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) +static int start_delalloc_inodes(struct btrfs_root *root, u64 *nr, bool snapshot) { struct btrfs_inode *binode; struct inode *inode; @@ -9427,9 +9427,11 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) list_add_tail(&work->list, &works); btrfs_queue_work(root->fs_info->flush_workers, &work->work); - ret++; - if (nr != -1 && ret >= nr) - goto out; + if (*nr != U64_MAX) { + (*nr)--; + if (*nr == 0) + goto out; + } cond_resched(); spin_lock(&root->delalloc_lock); } @@ -9454,18 +9456,15 @@ static int start_delalloc_inodes(struct btrfs_root *root, int nr, bool snapshot) int btrfs_start_delalloc_snapshot(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - int ret; + u64 nr = U64_MAX; if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - ret = start_delalloc_inodes(root, -1, true); - if (ret > 0) - ret = 0; - return ret; + return start_delalloc_inodes(root, &nr, true); } -int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) +int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, u64 nr) { struct btrfs_root *root; struct list_head splice; @@ -9488,15 +9487,10 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, int nr) &fs_info->delalloc_roots); spin_unlock(&fs_info->delalloc_root_lock); - ret = start_delalloc_inodes(root, nr, false); + ret = start_delalloc_inodes(root, &nr, false); btrfs_put_root(root); if (ret < 0) goto out; - - if (nr != -1) { - nr -= ret; - WARN_ON(nr < 0); - } spin_lock(&fs_info->delalloc_root_lock); } spin_unlock(&fs_info->delalloc_root_lock); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2d9109d9e98f..a5355a16eabb 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -4897,7 +4897,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SYNC: { int ret; - ret = btrfs_start_delalloc_roots(fs_info, -1); + ret = btrfs_start_delalloc_roots(fs_info, U64_MAX); if (ret) return ret; ret = btrfs_sync_fs(inode->i_sb, 1); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 475968ccbd1d..266ff14e4c8c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -477,7 +477,7 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, } static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, int nr_items) + unsigned long nr_pages, u64 nr_items) { struct super_block *sb = fs_info->sb; From 288be2d997360c71a1126261680d95ad9df5932a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:13 -0400 Subject: [PATCH 113/271] btrfs: remove orig from shrink_delalloc We don't use this anywhere inside of shrink_delalloc since 17024ad0a0fd ("Btrfs: fix early ENOSPC due to delalloc"), remove it. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 266ff14e4c8c..9c79bdfd145b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -517,7 +517,7 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, * shrink metadata reservation for delalloc */ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - u64 orig, bool wait_ordered) + bool wait_ordered) { struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; @@ -742,7 +742,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, num_bytes, + shrink_delalloc(fs_info, num_bytes * 2, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: From d7f81fac97e6d06c4e9bf090085137aa2dddff99 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:14 -0400 Subject: [PATCH 114/271] btrfs: handle U64_MAX for shrink_delalloc Data allocations are going to want to pass in U64_MAX for flushing space, adjust shrink_delalloc to handle this properly. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9c79bdfd145b..2b48b37216eb 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -530,8 +530,19 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, int loops; /* Calc the number of the pages we need flush for space reservation */ - items = calc_reclaim_items_nr(fs_info, to_reclaim); - to_reclaim = items * EXTENT_SIZE_PER_ITEM; + if (to_reclaim == U64_MAX) { + items = U64_MAX; + } else { + /* + * to_reclaim is set to however much metadata we need to + * reclaim, but reclaiming that much data doesn't really track + * exactly, so increase the amount to reclaim by 2x in order to + * make sure we're flushing enough delalloc to hopefully reclaim + * some metadata reservations. + */ + items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2; + to_reclaim = items * EXTENT_SIZE_PER_ITEM; + } trans = (struct btrfs_trans_handle *)current->journal_info; space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); @@ -742,7 +753,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes * 2, + shrink_delalloc(fs_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: From 920a9958c2553e4e23f9ff42dca2907ecf9a5fa0 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:15 -0400 Subject: [PATCH 115/271] btrfs: make shrink_delalloc take space_info as an arg Currently shrink_delalloc just looks up the metadata space info, but this won't work if we're trying to reclaim space for data chunks. We get the right space_info we want passed into flush_space, so simply pass that along to shrink_delalloc. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2b48b37216eb..3aaeff287b9c 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -516,10 +516,10 @@ static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, /* * shrink metadata reservation for delalloc */ -static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, - bool wait_ordered) +static void shrink_delalloc(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + u64 to_reclaim, bool wait_ordered) { - struct btrfs_space_info *space_info; struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; @@ -545,7 +545,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, u64 to_reclaim, } trans = (struct btrfs_trans_handle *)current->journal_info; - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); @@ -753,7 +752,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; case FLUSH_DELALLOC: case FLUSH_DELALLOC_WAIT: - shrink_delalloc(fs_info, num_bytes, + shrink_delalloc(fs_info, space_info, num_bytes, state == FLUSH_DELALLOC_WAIT); break; case FLUSH_DELAYED_REFS_NR: From c6c453032ea3b9033f0a312175aab6519ff9bdf7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:16 -0400 Subject: [PATCH 116/271] btrfs: make ALLOC_CHUNK use the space info flags We have traditionally used flush_space() to flush metadata space, so we've been unconditionally using btrfs_metadata_alloc_profile() for our profile to allocate a chunk. However if we're going to use this for data we need to use btrfs_get_alloc_profile() on the space_info we pass in. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 3aaeff287b9c..ed2ca20070f3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -777,7 +777,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, break; } ret = btrfs_chunk_alloc(trans, - btrfs_metadata_alloc_profile(fs_info), + btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); btrfs_end_transaction(trans); From 3308234a7e9828f8cbec308010348ddf712fc15e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:17 -0400 Subject: [PATCH 117/271] btrfs: call btrfs_try_granting_tickets when freeing reserved bytes We were missing a call to btrfs_try_granting_tickets in btrfs_free_reserved_bytes, so add it to handle the case where we're able to satisfy an allocation because we've freed a pending reservation. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index fb507a389f40..bc95f5f963fb 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2994,6 +2994,8 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, if (delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); + + btrfs_try_granting_tickets(cache->fs_info, space_info); spin_unlock(&space_info->lock); } From 2732798c9bb647978043e4daf54a6f9f3025d4c5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:18 -0400 Subject: [PATCH 118/271] btrfs: call btrfs_try_granting_tickets when unpinning anything When unpinning we were only calling btrfs_try_granting_tickets() if global_rsv->space_info == space_info, which is problematic because we use ticketing for SYSTEM chunks, and want to use it for DATA as well. Fix this by moving this call outside of that if statement. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index dbff61d36cab..79c4432e6a3f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2849,11 +2849,10 @@ static int unpin_extent_range(struct btrfs_fs_info *fs_info, len -= to_add; } spin_unlock(&global_rsv->lock); - /* Add to any tickets we may have */ - if (len) - btrfs_try_granting_tickets(fs_info, - space_info); } + /* Add to any tickets we may have */ + if (!readonly && return_free_space && len) + btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } From 99ffb43e5d4a0710fb094f3efbdf47f85f3b87c9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:19 -0400 Subject: [PATCH 119/271] btrfs: call btrfs_try_granting_tickets when reserving space If we have compression on we could free up more space than we reserved, and thus be able to make a space reservation. Add the call for this scenario. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index bc95f5f963fb..c9d7465501a4 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2961,6 +2961,13 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, space_info, -ram_bytes); if (delalloc) cache->delalloc_bytes += num_bytes; + + /* + * Compression can use less space than we reserved, so wake + * tickets if that happens + */ + if (num_bytes < ram_bytes) + btrfs_try_granting_tickets(cache->fs_info, space_info); } spin_unlock(&cache->lock); spin_unlock(&space_info->lock); From 39753e4a3a43c7d14a17f626f32d4a53ee59d563 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:20 -0400 Subject: [PATCH 120/271] btrfs: use the btrfs_space_info_free_bytes_may_use helper for delalloc We are going to use the ticket infrastructure for data, so use the btrfs_space_info_free_bytes_may_use() helper in btrfs_free_reserved_data_space_noquota() so we get the btrfs_try_granting_tickets call when we free our reservation. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Tested-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0e354e9e57d0..0a41bdcc14d1 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -277,9 +277,7 @@ void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); data_sinfo = fs_info->data_sinfo; - spin_lock(&data_sinfo->lock); - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, -len); - spin_unlock(&data_sinfo->lock); + btrfs_space_info_free_bytes_may_use(fs_info, data_sinfo, len); } /* From 38d715f494f2f1dddbf3d0c6e50aefff49519232 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:21 -0400 Subject: [PATCH 121/271] btrfs: use btrfs_start_delalloc_roots in shrink_delalloc The original iteration of flushing had us flushing delalloc and then checking to see if we could make our reservation, thus we were very careful about how many pages we would flush at once. But now that everything is async and we satisfy tickets as the space becomes available we don't have to keep track of any of this, simply try and flush the number of dirty inodes we may have in order to reclaim space to make our reservation. This cleans up our delalloc flushing significantly. The async_pages stuff is dropped because btrfs_start_delalloc_roots() handles the case that we generate async extents for us, so we no longer require this extra logic. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 55 +------------------------------------------ 1 file changed, 1 insertion(+), 54 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ed2ca20070f3..4b8753b1a628 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -476,28 +476,6 @@ void btrfs_dump_space_info(struct btrfs_fs_info *fs_info, up_read(&info->groups_sem); } -static void btrfs_writeback_inodes_sb_nr(struct btrfs_fs_info *fs_info, - unsigned long nr_pages, u64 nr_items) -{ - struct super_block *sb = fs_info->sb; - - if (down_read_trylock(&sb->s_umount)) { - writeback_inodes_sb_nr(sb, nr_pages, WB_REASON_FS_FREE_SPACE); - up_read(&sb->s_umount); - } else { - /* - * We needn't worry the filesystem going from r/w to r/o though - * we don't acquire ->s_umount mutex, because the filesystem - * should guarantee the delalloc inodes list be empty after - * the filesystem is readonly(all dirty pages are written to - * the disk). - */ - btrfs_start_delalloc_roots(fs_info, nr_items); - if (!current->journal_info) - btrfs_wait_ordered_roots(fs_info, nr_items, 0, (u64)-1); - } -} - static inline u64 calc_reclaim_items_nr(struct btrfs_fs_info *fs_info, u64 to_reclaim) { @@ -523,10 +501,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, struct btrfs_trans_handle *trans; u64 delalloc_bytes; u64 dio_bytes; - u64 async_pages; u64 items; long time_left; - unsigned long nr_pages; int loops; /* Calc the number of the pages we need flush for space reservation */ @@ -567,37 +543,8 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, loops = 0; while ((delalloc_bytes || dio_bytes) && loops < 3) { - nr_pages = min(delalloc_bytes, to_reclaim) >> PAGE_SHIFT; + btrfs_start_delalloc_roots(fs_info, items); - /* - * Triggers inode writeback for up to nr_pages. This will invoke - * ->writepages callback and trigger delalloc filling - * (btrfs_run_delalloc_range()). - */ - btrfs_writeback_inodes_sb_nr(fs_info, nr_pages, items); - - /* - * We need to wait for the compressed pages to start before - * we continue. - */ - async_pages = atomic_read(&fs_info->async_delalloc_pages); - if (!async_pages) - goto skip_async; - - /* - * Calculate how many compressed pages we want to be written - * before we continue. I.e if there are more async pages than we - * require wait_event will wait until nr_pages are written. - */ - if (async_pages <= nr_pages) - async_pages = 0; - else - async_pages -= nr_pages; - - wait_event(fs_info->async_submit_wait, - atomic_read(&fs_info->async_delalloc_pages) <= - (int)async_pages); -skip_async: spin_lock(&space_info->lock); if (list_empty(&space_info->tickets) && list_empty(&space_info->priority_tickets)) { From 448b966b49be55af8f4aa79b24d7896809fd8e67 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:22 -0400 Subject: [PATCH 122/271] btrfs: check tickets after waiting on ordered extents Right now if the space is freed up after the ordered extents complete (which is likely since the reservations are held until they complete), we would do extra delalloc flushing before we'd notice that we didn't have any more tickets. Fix this by moving the tickets check after our wait_ordered_extents check. Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 4b8753b1a628..6a9bc5cc487b 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -545,14 +545,6 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, while ((delalloc_bytes || dio_bytes) && loops < 3) { btrfs_start_delalloc_roots(fs_info, items); - spin_lock(&space_info->lock); - if (list_empty(&space_info->tickets) && - list_empty(&space_info->priority_tickets)) { - spin_unlock(&space_info->lock); - break; - } - spin_unlock(&space_info->lock); - loops++; if (wait_ordered && !trans) { btrfs_wait_ordered_roots(fs_info, items, 0, (u64)-1); @@ -561,6 +553,15 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, if (time_left) break; } + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets) && + list_empty(&space_info->priority_tickets)) { + spin_unlock(&space_info->lock); + break; + } + spin_unlock(&space_info->lock); + delalloc_bytes = percpu_counter_sum_positive( &fs_info->delalloc_bytes); dio_bytes = percpu_counter_sum_positive(&fs_info->dio_bytes); From 058e6d1d267fe9c7e072533d2e1469d93aec3ec7 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:23 -0400 Subject: [PATCH 123/271] btrfs: add flushing states for handling data reservations Currently the way we do data reservations is by seeing if we have enough space in our space_info. If we do not and we're a normal inode we'll 1) Attempt to force a chunk allocation until we can't anymore. 2) If that fails we'll flush delalloc, then commit the transaction, then run the delayed iputs. If we are a free space inode we're only allowed to force a chunk allocation. In order to use the normal flushing mechanism we need to encode this into a flush state array for normal inodes. Since both will start with allocating chunks until the space info is full there is no need to add this as a flush state, this will be handled specially. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/space-info.c | 6 ++++++ 2 files changed, 8 insertions(+) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index e2ec4f067857..098f006fc5a3 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2592,6 +2592,8 @@ enum btrfs_reserve_flush_enum { * * Can be interruped by fatal signal. */ + BTRFS_RESERVE_FLUSH_DATA, + BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE, BTRFS_RESERVE_FLUSH_ALL, /* diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 6a9bc5cc487b..980b6f641e78 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1017,6 +1017,12 @@ static const enum btrfs_flush_state evict_flush_states[] = { COMMIT_TRANS, }; +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + COMMIT_TRANS, + RUN_DELAYED_IPUTS, +}; + static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, From a1ed0a8216f7c7305ccfaa2c93b498f10b340ede Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:24 -0400 Subject: [PATCH 124/271] btrfs: add the data transaction commit logic into may_commit_transaction Data space flushing currently unconditionally commits the transaction twice in a row, and the last time it checks if there's enough pinned extents to satisfy its reservation before deciding to commit the transaction for the 3rd and final time. Encode this logic into may_commit_transaction(). In the next patch we will pass in U64_MAX for bytes_needed the first two times, and the final time we will pass in the actual bytes we need so the normal logic will apply. This patch exists solely to make the logical changes I will make to the flushing state machine separate to make it easier to bisect any performance related regressions. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 31 ++++++++++++++++++++++++------- 1 file changed, 24 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 980b6f641e78..abd2b8fb1833 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -579,21 +579,33 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info) + struct btrfs_space_info *space_info, + u64 bytes_needed) { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv; struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; - u64 bytes_needed; u64 reclaim_bytes = 0; u64 cur_free_bytes = 0; + bool do_commit = false; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; + /* + * If we are data and have passed in U64_MAX we just want to + * unconditionally commit the transaction to match the previous data + * flushing behavior. + */ + if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && + bytes_needed == U64_MAX) { + do_commit = true; + goto check_pinned; + } + spin_lock(&space_info->lock); cur_free_bytes = btrfs_space_info_used(space_info, true); if (cur_free_bytes < space_info->total_bytes) @@ -607,7 +619,8 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, else if (!list_empty(&space_info->tickets)) ticket = list_first_entry(&space_info->tickets, struct reserve_ticket, list); - bytes_needed = (ticket) ? ticket->bytes : 0; + if (ticket) + bytes_needed = ticket->bytes; if (bytes_needed > cur_free_bytes) bytes_needed -= cur_free_bytes; @@ -618,6 +631,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (!bytes_needed) return 0; +check_pinned: trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -627,15 +641,18 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, * we have block groups that are going to be freed, allowing us to * possibly do a chunk allocation the next loop through. */ - if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + if (do_commit || + test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || __percpu_counter_compare(&space_info->total_bytes_pinned, bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) goto commit; /* - * See if there is some space in the delayed insertion reservation for - * this reservation. + * See if there is some space in the delayed insertion reserve for this + * reservation. If the space_info's don't match (like for DATA or + * SYSTEM) then just go enospc, reclaiming this space won't recover any + * space to satisfy those reservations. */ if (space_info != delayed_rsv->space_info) goto enospc; @@ -742,7 +759,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info); + ret = may_commit_transaction(fs_info, space_info, num_bytes); break; default: ret = -ENOSPC; From 8698fc4eb7884eeba29adc4193f9d05fa2bed16b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:25 -0400 Subject: [PATCH 125/271] btrfs: add btrfs_reserve_data_bytes and use it Create a new function btrfs_reserve_data_bytes() in order to handle data reservations. This uses the new flush types and flush states to handle making data reservations. This patch specifically does not change any functionality, and is purposefully not cleaned up in order to make bisection easier for the future patches. The new helper is identical to the old helper in how it handles data reservations. We first try to force a chunk allocation, and then we run through the flush states all at once and in the same order that they were done with the old helper. Subsequent patches will clean this up and change the behavior of the flushing, and it is important to keep those changes separate so we can easily bisect down to the patch that caused the regression, rather than the patch that made us start using the new infrastructure. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/delalloc-space.c | 119 ++------------------------------------ fs/btrfs/space-info.c | 92 +++++++++++++++++++++++++++++ fs/btrfs/space-info.h | 2 + 3 files changed, 98 insertions(+), 115 deletions(-) diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 0a41bdcc14d1..bacee09b7bfd 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -115,126 +115,15 @@ int btrfs_alloc_data_chunk_ondemand(struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = 0; - int need_commit = 2; - int have_pinned_space; + enum btrfs_reserve_flush_enum flush = BTRFS_RESERVE_FLUSH_DATA; /* Make sure bytes are sectorsize aligned */ bytes = ALIGN(bytes, fs_info->sectorsize); - if (btrfs_is_free_space_inode(inode)) { - need_commit = 0; - ASSERT(current->journal_info); - } + if (btrfs_is_free_space_inode(inode)) + flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; -again: - /* Make sure we have enough space to handle the data first */ - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - - if (used + bytes > data_sinfo->total_bytes) { - struct btrfs_trans_handle *trans; - - /* - * If we don't have enough free bytes in this space then we need - * to alloc a new chunk. - */ - if (!data_sinfo->full) { - u64 alloc_target; - - data_sinfo->force_alloc = CHUNK_ALLOC_FORCE; - spin_unlock(&data_sinfo->lock); - - alloc_target = btrfs_data_alloc_profile(fs_info); - /* - * It is ugly that we don't call nolock join - * transaction for the free space inode case here. - * But it is safe because we only do the data space - * reservation for the free space cache in the - * transaction context, the common join transaction - * just increase the counter of the current transaction - * handler, doesn't try to acquire the trans_lock of - * the fs. - */ - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - - ret = btrfs_chunk_alloc(trans, alloc_target, - CHUNK_ALLOC_NO_FORCE); - btrfs_end_transaction(trans); - if (ret < 0) { - if (ret != -ENOSPC) - return ret; - else { - have_pinned_space = 1; - goto commit_trans; - } - } - - goto again; - } - - /* - * If we don't have enough pinned space to deal with this - * allocation, and no removed chunk in current transaction, - * don't bother committing the transaction. - */ - have_pinned_space = __percpu_counter_compare( - &data_sinfo->total_bytes_pinned, - used + bytes - data_sinfo->total_bytes, - BTRFS_TOTAL_BYTES_PINNED_BATCH); - spin_unlock(&data_sinfo->lock); - - /* Commit the current transaction and try again */ -commit_trans: - if (need_commit) { - need_commit--; - - if (need_commit > 0) { - btrfs_start_delalloc_roots(fs_info, -1); - btrfs_wait_ordered_roots(fs_info, U64_MAX, 0, - (u64)-1); - } - - trans = btrfs_join_transaction(root); - if (IS_ERR(trans)) - return PTR_ERR(trans); - if (have_pinned_space >= 0 || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, - &trans->transaction->flags) || - need_commit > 0) { - ret = btrfs_commit_transaction(trans); - if (ret) - return ret; - /* - * The cleaner kthread might still be doing iput - * operations. Wait for it to finish so that - * more space is released. We don't need to - * explicitly run the delayed iputs here because - * the commit_transaction would have woken up - * the cleaner. - */ - ret = btrfs_wait_on_delayed_iputs(fs_info); - if (ret) - return ret; - goto again; - } else { - btrfs_end_transaction(trans); - } - } - - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", - data_sinfo->flags, bytes, 1); - return -ENOSPC; - } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - spin_unlock(&data_sinfo->lock); - - return 0; + return btrfs_reserve_data_bytes(fs_info, bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index abd2b8fb1833..ad1dcb37593d 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1329,3 +1329,95 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, } return ret; } + +/** + * btrfs_reserve_data_bytes - try to reserve data bytes for an allocation + * @fs_info - the filesystem + * @bytes - the number of bytes we need + * @flush - how we are allowed to flush + * + * This will reserve bytes from the data space info. If there is not enough + * space then we will attempt to flush space as specified by flush. + */ +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush) +{ + struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + const enum btrfs_flush_state *states = NULL; + u64 used; + int states_nr = 0; + int commit_cycles = 2; + int ret = -ENOSPC; + + ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); + + if (flush == BTRFS_RESERVE_FLUSH_DATA) { + states = data_flush_states; + states_nr = ARRAY_SIZE(data_flush_states); + } + + spin_lock(&data_sinfo->lock); +again: + used = btrfs_space_info_used(data_sinfo, true); + + if (used + bytes > data_sinfo->total_bytes) { + u64 prev_total_bytes = data_sinfo->total_bytes; + int flush_state = 0; + + spin_unlock(&data_sinfo->lock); + + /* + * Everybody can force chunk allocation, so try this first to + * see if we can just bail here and make our reservation. + */ + flush_space(fs_info, data_sinfo, bytes, ALLOC_CHUNK_FORCE); + spin_lock(&data_sinfo->lock); + if (prev_total_bytes < data_sinfo->total_bytes) + goto again; + spin_unlock(&data_sinfo->lock); + + /* + * Cycle through the rest of the flushing options for our flush + * type, then try again. + */ + while (flush_state < states_nr) { + u64 flush_bytes = U64_MAX; + + /* + * Previously we unconditionally committed the + * transaction twice before finally checking against + * pinned space before committing the final time. We + * also skipped flushing delalloc the final pass + * through. + */ + if (!commit_cycles) { + if (states[flush_state] == FLUSH_DELALLOC_WAIT) { + flush_state++; + continue; + } + if (states[flush_state] == COMMIT_TRANS) + flush_bytes = bytes; + } + + flush_space(fs_info, data_sinfo, flush_bytes, + states[flush_state]); + flush_state++; + } + + if (!commit_cycles) + goto out; + + commit_cycles--; + spin_lock(&data_sinfo->lock); + goto again; + } + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + ret = 0; + spin_unlock(&data_sinfo->lock); +out: + if (ret) + trace_btrfs_space_reservation(fs_info, + "space_info:enospc", + data_sinfo->flags, bytes, 1); + return ret; +} diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index c3c64019950a..5646393b928c 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -149,5 +149,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(fs_info, space_info); spin_unlock(&space_info->lock); } +int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, + enum btrfs_reserve_flush_enum flush); #endif /* BTRFS_SPACE_INFO_H */ From 1004f6860f8c64f8d959051d6299981c09046cbe Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:26 -0400 Subject: [PATCH 126/271] btrfs: use ticketing for data space reservations Now that we have all the infrastructure in place, use the ticketing infrastructure to make data allocations. This still maintains the exact same flushing behavior, but now we're using tickets to get our reservations satisfied. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 122 ++++++++++++++++++++++-------------------- 1 file changed, 64 insertions(+), 58 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index ad1dcb37593d..9c118af363de 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1070,6 +1070,53 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, } while (flush_state < states_nr); } +static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, + struct reserve_ticket *ticket, + const enum btrfs_flush_state *states, + int states_nr) +{ + int flush_state = 0; + int commit_cycles = 2; + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + } +again: + while (flush_state < states_nr) { + u64 flush_bytes = U64_MAX; + + if (!commit_cycles) { + if (states[flush_state] == FLUSH_DELALLOC_WAIT) { + flush_state++; + continue; + } + if (states[flush_state] == COMMIT_TRANS) + flush_bytes = ticket->bytes; + } + + flush_space(fs_info, space_info, flush_bytes, states[flush_state]); + spin_lock(&space_info->lock); + if (ticket->bytes == 0) { + spin_unlock(&space_info->lock); + return; + } + spin_unlock(&space_info->lock); + flush_state++; + } + if (commit_cycles) { + commit_cycles--; + flush_state = 0; + goto again; + } +} + static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket) @@ -1136,6 +1183,13 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; + case BTRFS_RESERVE_FLUSH_DATA: + priority_reclaim_data_space(fs_info, space_info, ticket, + data_flush_states, ARRAY_SIZE(data_flush_states)); + break; + case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: + priority_reclaim_data_space(fs_info, space_info, ticket, NULL, 0); + break; default: ASSERT(0); break; @@ -1343,78 +1397,30 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - const enum btrfs_flush_state *states = NULL; u64 used; - int states_nr = 0; - int commit_cycles = 2; int ret = -ENOSPC; ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - if (flush == BTRFS_RESERVE_FLUSH_DATA) { - states = data_flush_states; - states_nr = ARRAY_SIZE(data_flush_states); - } - spin_lock(&data_sinfo->lock); -again: used = btrfs_space_info_used(data_sinfo, true); if (used + bytes > data_sinfo->total_bytes) { - u64 prev_total_bytes = data_sinfo->total_bytes; - int flush_state = 0; + struct reserve_ticket ticket; + init_waitqueue_head(&ticket.wait); + ticket.bytes = bytes; + ticket.error = 0; + list_add_tail(&ticket.list, &data_sinfo->priority_tickets); + data_sinfo->reclaim_size += bytes; spin_unlock(&data_sinfo->lock); - /* - * Everybody can force chunk allocation, so try this first to - * see if we can just bail here and make our reservation. - */ - flush_space(fs_info, data_sinfo, bytes, ALLOC_CHUNK_FORCE); - spin_lock(&data_sinfo->lock); - if (prev_total_bytes < data_sinfo->total_bytes) - goto again; + ret = handle_reserve_ticket(fs_info, data_sinfo, &ticket, flush); + } else { + btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); + ret = 0; spin_unlock(&data_sinfo->lock); - - /* - * Cycle through the rest of the flushing options for our flush - * type, then try again. - */ - while (flush_state < states_nr) { - u64 flush_bytes = U64_MAX; - - /* - * Previously we unconditionally committed the - * transaction twice before finally checking against - * pinned space before committing the final time. We - * also skipped flushing delalloc the final pass - * through. - */ - if (!commit_cycles) { - if (states[flush_state] == FLUSH_DELALLOC_WAIT) { - flush_state++; - continue; - } - if (states[flush_state] == COMMIT_TRANS) - flush_bytes = bytes; - } - - flush_space(fs_info, data_sinfo, flush_bytes, - states[flush_state]); - flush_state++; - } - - if (!commit_cycles) - goto out; - - commit_cycles--; - spin_lock(&data_sinfo->lock); - goto again; } - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - ret = 0; - spin_unlock(&data_sinfo->lock); -out: if (ret) trace_btrfs_space_reservation(fs_info, "space_info:enospc", From 0532a6f8b6ce64ced0ffcca0a2ab5d66a8eb9a6d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:27 -0400 Subject: [PATCH 127/271] btrfs: serialize data reservations if we are flushing Nikolay reported a problem where generic/371 would fail sometimes with a slow drive. The gist of the test is that we fallocate a file in parallel with a pwrite of a different file. These two files combined are smaller than the file system, but sometimes the pwrite would ENOSPC. A fair bit of investigation uncovered the fact that the fallocate workload was racing in and grabbing the free space that the pwrite workload was trying to free up so it could make its own reservation. After a few loops of this eventually the pwrite workload would error out with an ENOSPC. We've had the same problem with metadata as well, and we serialized all metadata allocations to satisfy this problem. This wasn't usually a problem with data because data reservations are more straightforward, but obviously could still happen. Fix this by not allowing reservations to occur if there are any pending tickets waiting to be satisfied on the space info. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 9c118af363de..fd9eff51e5b4 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1399,13 +1399,16 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; u64 used; int ret = -ENOSPC; + bool pending_tickets; ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); spin_lock(&data_sinfo->lock); used = btrfs_space_info_used(data_sinfo, true); + pending_tickets = !list_empty(&data_sinfo->tickets) || + !list_empty(&data_sinfo->priority_tickets); - if (used + bytes > data_sinfo->total_bytes) { + if (pending_tickets || used + bytes > data_sinfo->total_bytes) { struct reserve_ticket ticket; init_waitqueue_head(&ticket.wait); From f3bda421c16fe2f67cb3ff81b65ce8de331dd8e2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:28 -0400 Subject: [PATCH 128/271] btrfs: use the same helper for data and metadata reservations Now that data reservations follow the same pattern as metadata reservations we can simply rename __reserve_metadata_bytes to __reserve_bytes and use that helper for data reservations. Things to keep in mind, btrfs_can_overcommit() returns 0 for data, because we can never overcommit. We also will never pass in FLUSH_ALL for data, so we'll simply be added to the priority list and go straight into handle_reserve_ticket. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 46 ++++++++++++------------------------------- 1 file changed, 13 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index fd9eff51e5b4..87f92fc67542 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1249,10 +1249,9 @@ static inline bool is_normal_flushing(enum btrfs_reserve_flush_enum flush) * regain reservations will be made and this will fail if there is not enough * space already. */ -static int __reserve_metadata_bytes(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 orig_bytes, - enum btrfs_reserve_flush_enum flush) +static int __reserve_bytes(struct btrfs_fs_info *fs_info, + struct btrfs_space_info *space_info, u64 orig_bytes, + enum btrfs_reserve_flush_enum flush) { struct reserve_ticket ticket; u64 used; @@ -1364,8 +1363,7 @@ int btrfs_reserve_metadata_bytes(struct btrfs_root *root, struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv; int ret; - ret = __reserve_metadata_bytes(fs_info, block_rsv->space_info, - orig_bytes, flush); + ret = __reserve_bytes(fs_info, block_rsv->space_info, orig_bytes, flush); if (ret == -ENOSPC && unlikely(root->orphan_cleanup_state == ORPHAN_CLEANUP_STARTED)) { if (block_rsv != global_rsv && @@ -1397,36 +1395,18 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; - u64 used; - int ret = -ENOSPC; - bool pending_tickets; + int ret; + ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || + flush == BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - spin_lock(&data_sinfo->lock); - used = btrfs_space_info_used(data_sinfo, true); - pending_tickets = !list_empty(&data_sinfo->tickets) || - !list_empty(&data_sinfo->priority_tickets); - - if (pending_tickets || used + bytes > data_sinfo->total_bytes) { - struct reserve_ticket ticket; - - init_waitqueue_head(&ticket.wait); - ticket.bytes = bytes; - ticket.error = 0; - list_add_tail(&ticket.list, &data_sinfo->priority_tickets); - data_sinfo->reclaim_size += bytes; - spin_unlock(&data_sinfo->lock); - - ret = handle_reserve_ticket(fs_info, data_sinfo, &ticket, flush); - } else { - btrfs_space_info_update_bytes_may_use(fs_info, data_sinfo, bytes); - ret = 0; - spin_unlock(&data_sinfo->lock); - } - if (ret) - trace_btrfs_space_reservation(fs_info, - "space_info:enospc", + ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + if (ret == -ENOSPC) { + trace_btrfs_space_reservation(fs_info, "space_info:enospc", data_sinfo->flags, bytes, 1); + if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) + btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + } return ret; } From 028270013586681b02db49be20e05b56e662dc55 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:29 -0400 Subject: [PATCH 129/271] btrfs: drop the commit_cycles stuff for data reservations This was an old wart left over from how we previously did data reservations. Before we could have people race in and take a reservation while we were flushing space, so we needed to make sure we looped a few times before giving up. Now that we're using the ticketing infrastructure we don't have to worry about this and can drop the logic altogether. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 21 ++------------------- 1 file changed, 2 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 87f92fc67542..244cd2a61386 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1077,7 +1077,6 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, int states_nr) { int flush_state = 0; - int commit_cycles = 2; while (!space_info->full) { flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); @@ -1088,20 +1087,9 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, } spin_unlock(&space_info->lock); } -again: + while (flush_state < states_nr) { - u64 flush_bytes = U64_MAX; - - if (!commit_cycles) { - if (states[flush_state] == FLUSH_DELALLOC_WAIT) { - flush_state++; - continue; - } - if (states[flush_state] == COMMIT_TRANS) - flush_bytes = ticket->bytes; - } - - flush_space(fs_info, space_info, flush_bytes, states[flush_state]); + flush_space(fs_info, space_info, U64_MAX, states[flush_state]); spin_lock(&space_info->lock); if (ticket->bytes == 0) { spin_unlock(&space_info->lock); @@ -1110,11 +1098,6 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, spin_unlock(&space_info->lock); flush_state++; } - if (commit_cycles) { - commit_cycles--; - flush_state = 0; - goto again; - } } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, From bb86bd3db82ed6d28d4ab4ed33c7ee3b27290e49 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:30 -0400 Subject: [PATCH 130/271] btrfs: don't force commit if we are data We used to unconditionally commit the transaction at least 2 times and then on the 3rd try check against pinned space to make sure committing the transaction was worth the effort. This is overkill, we know nobody is going to steal our reservation, and if we can't make our reservation with the pinned amount simply bail out. This also cleans up the passing of bytes_needed to may_commit_transaction, as that was the thing we added into place in order to accomplish this behavior. We no longer need it so remove that mess. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 244cd2a61386..c229dffa46d8 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -579,8 +579,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info, * will return -ENOSPC. */ static int may_commit_transaction(struct btrfs_fs_info *fs_info, - struct btrfs_space_info *space_info, - u64 bytes_needed) + struct btrfs_space_info *space_info) { struct reserve_ticket *ticket = NULL; struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_block_rsv; @@ -588,24 +587,13 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *trans_rsv = &fs_info->trans_block_rsv; struct btrfs_trans_handle *trans; u64 reclaim_bytes = 0; + u64 bytes_needed = 0; u64 cur_free_bytes = 0; - bool do_commit = false; trans = (struct btrfs_trans_handle *)current->journal_info; if (trans) return -EAGAIN; - /* - * If we are data and have passed in U64_MAX we just want to - * unconditionally commit the transaction to match the previous data - * flushing behavior. - */ - if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) && - bytes_needed == U64_MAX) { - do_commit = true; - goto check_pinned; - } - spin_lock(&space_info->lock); cur_free_bytes = btrfs_space_info_used(space_info, true); if (cur_free_bytes < space_info->total_bytes) @@ -631,7 +619,6 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, if (!bytes_needed) return 0; -check_pinned: trans = btrfs_join_transaction(fs_info->extent_root); if (IS_ERR(trans)) return PTR_ERR(trans); @@ -641,8 +628,7 @@ static int may_commit_transaction(struct btrfs_fs_info *fs_info, * we have block groups that are going to be freed, allowing us to * possibly do a chunk allocation the next loop through. */ - if (do_commit || - test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || + if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags) || __percpu_counter_compare(&space_info->total_bytes_pinned, bytes_needed, BTRFS_TOTAL_BYTES_PINNED_BATCH) >= 0) @@ -759,7 +745,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, btrfs_wait_on_delayed_iputs(fs_info); break; case COMMIT_TRANS: - ret = may_commit_transaction(fs_info, space_info, num_bytes); + ret = may_commit_transaction(fs_info, space_info); break; default: ret = -ENOSPC; From 327feeeb2e9bc68d2af1c581394ec6706f600bb9 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:31 -0400 Subject: [PATCH 131/271] btrfs: run delayed iputs before committing the transaction for data Before we were waiting on iputs after we committed the transaction, but this doesn't really make much sense. We want to reclaim any space we may have in order to be more likely to commit the transaction, due to pinned space being added by running the delayed iputs. Fix this by making delayed iputs run before committing the transaction. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index c229dffa46d8..6d82c444ceff 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1022,8 +1022,8 @@ static const enum btrfs_flush_state evict_flush_states[] = { static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, - COMMIT_TRANS, RUN_DELAYED_IPUTS, + COMMIT_TRANS, }; static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, From cb3e3930459972dbafc274bcde3fdc1462429391 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:32 -0400 Subject: [PATCH 132/271] btrfs: flush delayed refs when trying to reserve data space We can end up with freed extents in the delayed refs, and thus may_commit_transaction() may not think we have enough pinned space to commit the transaction and we'll ENOSPC early. Handle this by running the delayed refs in order to make sure pinned is uptodate before we try to commit the transaction. Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 6d82c444ceff..8496518297f3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1023,6 +1023,7 @@ static const enum btrfs_flush_state evict_flush_states[] = { static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, COMMIT_TRANS, }; From 5705674081cee751659a6adb4849645a566a6cf4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:33 -0400 Subject: [PATCH 133/271] btrfs: do async reclaim for data reservations Now that we have the data ticketing stuff in place, move normal data reservations to use an async reclaim helper to satisfy tickets. Before we could have multiple tasks race in and both allocate chunks, resulting in more data chunks than we would necessarily need. Serializing these allocations and making a single thread responsible for flushing will only allocate chunks as needed, as well as cut down on transaction commits and other flush related activities. Priority reservations will still work as they have before, simply trying to allocate a chunk until they can make their reservation. Reviewed-by: Nikolay Borisov Tested-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 3 +- fs/btrfs/disk-io.c | 3 +- fs/btrfs/space-info.c | 117 ++++++++++++++++++++++++++++++------------ fs/btrfs/super.c | 1 + 4 files changed, 89 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 098f006fc5a3..f2d4c14fc273 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -494,7 +494,7 @@ enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_DONE = 2, }; -void btrfs_init_async_reclaim_work(struct work_struct *work); +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); /* fs_info */ struct reloc_control; @@ -912,6 +912,7 @@ struct btrfs_fs_info { /* Used to reclaim the metadata space in the background. */ struct work_struct async_reclaim_work; + struct work_struct async_data_reclaim_work; spinlock_t unused_bgs_lock; struct list_head unused_bgs; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index b1f6662f89c4..a496219baa39 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2753,7 +2753,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->check_integrity_print_mask = 0; #endif btrfs_init_balance(fs_info); - btrfs_init_async_reclaim_work(&fs_info->async_reclaim_work); + btrfs_init_async_reclaim_work(fs_info); spin_lock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT; @@ -4056,6 +4056,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 8496518297f3..bbddb8b5eff1 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -998,9 +998,79 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } -void btrfs_init_async_reclaim_work(struct work_struct *work) +static const enum btrfs_flush_state data_flush_states[] = { + FLUSH_DELALLOC_WAIT, + RUN_DELAYED_IPUTS, + FLUSH_DELAYED_REFS, + COMMIT_TRANS, +}; + +static void btrfs_async_reclaim_data_space(struct work_struct *work) { - INIT_WORK(work, btrfs_async_reclaim_metadata_space); + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + u64 last_tickets_id; + int flush_state = 0; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + + while (!space_info->full) { + flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + last_tickets_id = space_info->tickets_id; + spin_unlock(&space_info->lock); + } + + while (flush_state < ARRAY_SIZE(data_flush_states)) { + flush_space(fs_info, space_info, U64_MAX, + data_flush_states[flush_state]); + spin_lock(&space_info->lock); + if (list_empty(&space_info->tickets)) { + space_info->flush = 0; + spin_unlock(&space_info->lock); + return; + } + + if (last_tickets_id == space_info->tickets_id) { + flush_state++; + } else { + last_tickets_id = space_info->tickets_id; + flush_state = 0; + } + + if (flush_state >= ARRAY_SIZE(data_flush_states)) { + if (space_info->full) { + if (maybe_fail_all_tickets(fs_info, space_info)) + flush_state = 0; + else + space_info->flush = 0; + } else { + flush_state = 0; + } + } + spin_unlock(&space_info->lock); + } +} + +void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) +{ + INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); + INIT_WORK(&fs_info->async_data_reclaim_work, btrfs_async_reclaim_data_space); } static const enum btrfs_flush_state priority_flush_states[] = { @@ -1020,13 +1090,6 @@ static const enum btrfs_flush_state evict_flush_states[] = { COMMIT_TRANS, }; -static const enum btrfs_flush_state data_flush_states[] = { - FLUSH_DELALLOC_WAIT, - RUN_DELAYED_IPUTS, - FLUSH_DELAYED_REFS, - COMMIT_TRANS, -}; - static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, struct reserve_ticket *ticket, @@ -1059,12 +1122,8 @@ static void priority_reclaim_metadata_space(struct btrfs_fs_info *fs_info, static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, - struct reserve_ticket *ticket, - const enum btrfs_flush_state *states, - int states_nr) + struct reserve_ticket *ticket) { - int flush_state = 0; - while (!space_info->full) { flush_space(fs_info, space_info, U64_MAX, ALLOC_CHUNK_FORCE); spin_lock(&space_info->lock); @@ -1074,17 +1133,6 @@ static void priority_reclaim_data_space(struct btrfs_fs_info *fs_info, } spin_unlock(&space_info->lock); } - - while (flush_state < states_nr) { - flush_space(fs_info, space_info, U64_MAX, states[flush_state]); - spin_lock(&space_info->lock); - if (ticket->bytes == 0) { - spin_unlock(&space_info->lock); - return; - } - spin_unlock(&space_info->lock); - flush_state++; - } } static void wait_reserve_ticket(struct btrfs_fs_info *fs_info, @@ -1139,6 +1187,7 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, int ret; switch (flush) { + case BTRFS_RESERVE_FLUSH_DATA: case BTRFS_RESERVE_FLUSH_ALL: case BTRFS_RESERVE_FLUSH_ALL_STEAL: wait_reserve_ticket(fs_info, space_info, ticket); @@ -1153,12 +1202,8 @@ static int handle_reserve_ticket(struct btrfs_fs_info *fs_info, evict_flush_states, ARRAY_SIZE(evict_flush_states)); break; - case BTRFS_RESERVE_FLUSH_DATA: - priority_reclaim_data_space(fs_info, space_info, ticket, - data_flush_states, ARRAY_SIZE(data_flush_states)); - break; case BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE: - priority_reclaim_data_space(fs_info, space_info, ticket, NULL, 0); + priority_reclaim_data_space(fs_info, space_info, ticket); break; default: ASSERT(0); @@ -1223,6 +1268,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, u64 orig_bytes, enum btrfs_reserve_flush_enum flush) { + struct work_struct *async_work; struct reserve_ticket ticket; u64 used; int ret = 0; @@ -1231,6 +1277,11 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, ASSERT(orig_bytes); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_ALL); + if (flush == BTRFS_RESERVE_FLUSH_DATA) + async_work = &fs_info->async_data_reclaim_work; + else + async_work = &fs_info->async_reclaim_work; + spin_lock(&space_info->lock); ret = -ENOSPC; used = btrfs_space_info_used(space_info, true); @@ -1272,7 +1323,8 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, init_waitqueue_head(&ticket.wait); ticket.steal = (flush == BTRFS_RESERVE_FLUSH_ALL_STEAL); if (flush == BTRFS_RESERVE_FLUSH_ALL || - flush == BTRFS_RESERVE_FLUSH_ALL_STEAL) { + flush == BTRFS_RESERVE_FLUSH_ALL_STEAL || + flush == BTRFS_RESERVE_FLUSH_DATA) { list_add_tail(&ticket.list, &space_info->tickets); if (!space_info->flush) { space_info->flush = 1; @@ -1280,8 +1332,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info, space_info->flags, orig_bytes, flush, "enospc"); - queue_work(system_unbound_wq, - &fs_info->async_reclaim_work); + queue_work(system_unbound_wq, async_work); } } else { list_add_tail(&ticket.list, diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 25967ecaaf0a..3ebe7240c63d 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -1871,6 +1871,7 @@ static int btrfs_remount(struct super_block *sb, int *flags, char *data) * the filesystem is busy. */ cancel_work_sync(&fs_info->async_reclaim_work); + cancel_work_sync(&fs_info->async_data_reclaim_work); btrfs_discard_cleanup(fs_info); From 1a7a92c8ddcd1edc4a5407de8f56edc6cfdf394a Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 21 Jul 2020 10:22:34 -0400 Subject: [PATCH 134/271] btrfs: add a comment explaining the data flush steps The data flushing steps are not obvious to people other than myself and Chris. Write a giant comment explaining the reasoning behind each flush step for data as well as why it is in that particular order. Reviewed-by: Johannes Thumshirn Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 47 +++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index bbddb8b5eff1..71aa9e0de61e 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -998,6 +998,53 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } while (flush_state <= COMMIT_TRANS); } +/* + * FLUSH_DELALLOC_WAIT: + * Space is freed from flushing delalloc in one of two ways. + * + * 1) compression is on and we allocate less space than we reserved + * 2) we are overwriting existing space + * + * For #1 that extra space is reclaimed as soon as the delalloc pages are + * COWed, by way of btrfs_add_reserved_bytes() which adds the actual extent + * length to ->bytes_reserved, and subtracts the reserved space from + * ->bytes_may_use. + * + * For #2 this is trickier. Once the ordered extent runs we will drop the + * extent in the range we are overwriting, which creates a delayed ref for + * that freed extent. This however is not reclaimed until the transaction + * commits, thus the next stages. + * + * RUN_DELAYED_IPUTS + * If we are freeing inodes, we want to make sure all delayed iputs have + * completed, because they could have been on an inode with i_nlink == 0, and + * thus have been truncated and freed up space. But again this space is not + * immediately re-usable, it comes in the form of a delayed ref, which must be + * run and then the transaction must be committed. + * + * FLUSH_DELAYED_REFS + * The above two cases generate delayed refs that will affect + * ->total_bytes_pinned. However this counter can be inconsistent with + * reality if there are outstanding delayed refs. This is because we adjust + * the counter based solely on the current set of delayed refs and disregard + * any on-disk state which might include more refs. So for example, if we + * have an extent with 2 references, but we only drop 1, we'll see that there + * is a negative delayed ref count for the extent and assume that the space + * will be freed, and thus increase ->total_bytes_pinned. + * + * Running the delayed refs gives us the actual real view of what will be + * freed at the transaction commit time. This stage will not actually free + * space for us, it just makes sure that may_commit_transaction() has all of + * the information it needs to make the right decision. + * + * COMMIT_TRANS + * This is where we reclaim all of the pinned space generated by the previous + * two stages. We will not commit the transaction if we don't think we're + * likely to satisfy our request, which means if our current free space + + * total_bytes_pinned < reservation we will not commit. This is why the + * previous states are actually important, to make sure we know for sure + * whether committing the transaction will allow us to make progress. + */ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, From c4923027bd58cdccbe54e13298a4d914668a56da Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 25 Aug 2020 16:56:59 -0400 Subject: [PATCH 135/271] btrfs: fix possible infinite loop in data async reclaim Dave reported an issue where generic/102 would sometimes hang. This turned out to be because we'd get into this spot where we were no longer making progress on data reservations because our exit condition was not met. The log is basically while (!space_info->full && !list_empty(&space_info->tickets)) flush_space(space_info, flush_state); where flush state is our various flush states, but doesn't include ALLOC_CHUNK_FORCE. This is because we actually lead with allocating chunks, and so the assumption was that once you got to the actual flushing states you could no longer allocate chunks. This was a stupid assumption, because you could have deleted block groups that would be reclaimed by a transaction commit, thus unsetting space_info->full. This is essentially what happens with generic/102, and so sometimes you'd get stuck in the flushing loop because we weren't allocating chunks, but flushing space wasn't giving us what we needed to make progress. Fix this by adding ALLOC_CHUNK_FORCE to the end of our flushing states, that way we will eventually bail out because we did end up with space_info->full if we free'd a chunk previously. Otherwise, as is the case for this test, we'll allocate our chunk and continue on our happy merry way. Reported-by: David Sterba Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 71aa9e0de61e..b733718f45d3 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1044,12 +1044,18 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) * total_bytes_pinned < reservation we will not commit. This is why the * previous states are actually important, to make sure we know for sure * whether committing the transaction will allow us to make progress. + * + * ALLOC_CHUNK_FORCE + * For data we start with alloc chunk force, however we could have been full + * before, and then the transaction commit could have freed new block groups, + * so if we now have space to allocate do the force chunk allocation. */ static const enum btrfs_flush_state data_flush_states[] = { FLUSH_DELALLOC_WAIT, RUN_DELAYED_IPUTS, FLUSH_DELAYED_REFS, COMMIT_TRANS, + ALLOC_CHUNK_FORCE, }; static void btrfs_async_reclaim_data_space(struct work_struct *work) From e21139c621ada48f2bc382865ea34dd49d45c2a8 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 13 Aug 2020 14:33:52 +0800 Subject: [PATCH 136/271] btrfs: cleanup calculation of lockend in lock_and_cleanup_extent_if_need() We're just doing rounding up to sectorsize to calculate the lockend. There is no need to do the unnecessary length calculation, just direct round_up() is enough. Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4507c3d09399..de14ed402390 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1477,9 +1477,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, int ret = 0; start_pos = round_down(pos, fs_info->sectorsize); - last_pos = start_pos - + round_up(pos + write_bytes - start_pos, - fs_info->sectorsize) - 1; + last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1; if (start_pos < inode->vfs_inode.i_size) { struct btrfs_ordered_extent *ordered; From 9e6df7cedfdf3e2469aa46fb772ca70b422132ec Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 10:56:00 +0200 Subject: [PATCH 137/271] btrfs: remove const from btrfs_feature_set_name The function btrfs_feature_set_name returns a const char pointer, the second const is not necessary and reported as a warning: In file included from fs/btrfs/space-info.c:6: fs/btrfs/sysfs.h:16:1: warning: type qualifiers ignored on function return type [-Wignored-qualifiers] 16 | const char * const btrfs_feature_set_name(enum btrfs_feature_set set); | ^~~~~ Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 2 +- fs/btrfs/sysfs.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index d7c58c89fb9c..fcc8757b4e93 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -973,7 +973,7 @@ static const char * const btrfs_feature_set_names[FEAT_MAX] = { [FEAT_INCOMPAT] = "incompat", }; -const char * const btrfs_feature_set_name(enum btrfs_feature_set set) +const char *btrfs_feature_set_name(enum btrfs_feature_set set) { return btrfs_feature_set_names[set]; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index c9efa15f96e0..4217823e255c 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -13,7 +13,7 @@ enum btrfs_feature_set { }; char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); -const char * const btrfs_feature_set_name(enum btrfs_feature_set set); +const char *btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, From cb4c9198302b02c3f0eaf4267636e5ce1f4a4765 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 10:58:38 +0200 Subject: [PATCH 138/271] btrfs: compression: move declarations to header The declarations of compression algorithm callbacks are defined in the .c file as they're used from there. Compiler warns that there are no declarations for public functions when compiling lzo.c/zlib.c/zstd.c. Fix that by moving the declarations to the header as it's the common place for all of them. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/compression.c | 35 ----------------------------------- fs/btrfs/compression.h | 35 +++++++++++++++++++++++++++++++++++ 2 files changed, 35 insertions(+), 35 deletions(-) diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 1ab56a734e70..eeface30facd 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -29,41 +29,6 @@ #include "extent_io.h" #include "extent_map.h" -int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zlib_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *zlib_alloc_workspace(unsigned int level); -void zlib_free_workspace(struct list_head *ws); -struct list_head *zlib_get_workspace(unsigned int level); - -int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int lzo_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -struct list_head *lzo_alloc_workspace(unsigned int level); -void lzo_free_workspace(struct list_head *ws); - -int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, - u64 start, struct page **pages, unsigned long *out_pages, - unsigned long *total_in, unsigned long *total_out); -int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); -int zstd_decompress(struct list_head *ws, unsigned char *data_in, - struct page *dest_page, unsigned long start_byte, size_t srclen, - size_t destlen); -void zstd_init_workspace_manager(void); -void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); -void zstd_put_workspace(struct list_head *ws); - static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; const char* btrfs_compress_type2str(enum btrfs_compression_type type) diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 9f3dbe372631..8001b700ea3a 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -144,4 +144,39 @@ bool btrfs_compress_is_valid_type(const char *str, size_t len); int btrfs_compress_heuristic(struct inode *inode, u64 start, u64 end); +int zlib_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zlib_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *zlib_alloc_workspace(unsigned int level); +void zlib_free_workspace(struct list_head *ws); +struct list_head *zlib_get_workspace(unsigned int level); + +int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int lzo_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +struct list_head *lzo_alloc_workspace(unsigned int level); +void lzo_free_workspace(struct list_head *ws); + +int zstd_compress_pages(struct list_head *ws, struct address_space *mapping, + u64 start, struct page **pages, unsigned long *out_pages, + unsigned long *total_in, unsigned long *total_out); +int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb); +int zstd_decompress(struct list_head *ws, unsigned char *data_in, + struct page *dest_page, unsigned long start_byte, size_t srclen, + size_t destlen); +void zstd_init_workspace_manager(void); +void zstd_cleanup_workspace_manager(void); +struct list_head *zstd_alloc_workspace(unsigned int level); +void zstd_free_workspace(struct list_head *ws); +struct list_head *zstd_get_workspace(unsigned int level); +void zstd_put_workspace(struct list_head *ws); + #endif From 0af447d0507b1623cd4b94ad941460951e141783 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 12:08:37 +0200 Subject: [PATCH 139/271] btrfs: remove unnecessarily shadowed variables In btrfs_orphan_cleanup, there's another instance of fs_info, but it's the same as the one we already have. In btrfs_backref_finish_upper_links, rb_node is same type and used as temporary cursor to the tree. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/backref.c | 1 - fs/btrfs/inode.c | 1 - 2 files changed, 2 deletions(-) diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index ea1c28ccb44f..b3268f4ea5f3 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -2997,7 +2997,6 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, while (!list_empty(&pending_edge)) { struct btrfs_backref_node *upper; struct btrfs_backref_node *lower; - struct rb_node *rb_node; edge = list_first_entry(&pending_edge, struct btrfs_backref_edge, list[UPPER]); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 48b8b5f09f47..50d9c335271a 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3055,7 +3055,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) if (ret == -ENOENT && root == fs_info->tree_root) { struct btrfs_root *dead_root; - struct btrfs_fs_info *fs_info = root->fs_info; int is_dead_root = 0; /* From 8bb1cf1ba639d271983be29db68d30e2eebabb98 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 12:12:38 +0200 Subject: [PATCH 140/271] btrfs: scrub: rename ratelimit state varaible to avoid shadowing There's already defined _rs within ctree.h:btrfs_printk_ratelimited, local variables should not use _ to avoid such name clashes with macro-local variables. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/scrub.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 354ab9985a34..cf63f1e27a27 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -835,7 +835,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) int success; bool full_stripe_locked; unsigned int nofs_flag; - static DEFINE_RATELIMIT_STATE(_rs, DEFAULT_RATELIMIT_INTERVAL, + static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); BUG_ON(sblock_to_check->page_count < 1); @@ -969,14 +969,14 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.read_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("i/o error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS); } else if (sblock_bad->checksum_error) { spin_lock(&sctx->stat_lock); sctx->stat.csum_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum error", sblock_to_check); btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -984,7 +984,7 @@ static int scrub_handle_errored_block(struct scrub_block *sblock_to_check) spin_lock(&sctx->stat_lock); sctx->stat.verify_errors++; spin_unlock(&sctx->stat_lock); - if (__ratelimit(&_rs)) + if (__ratelimit(&rs)) scrub_print_warning("checksum/header error", sblock_to_check); if (sblock_bad->generation_error) From 1b51d6fce45ec1f2279f06f57fe1c8a703437b27 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Mon, 17 Aug 2020 12:16:57 +0200 Subject: [PATCH 141/271] btrfs: send: remove indirect callback parameter for changed_cb There's a custom callback passed to btrfs_compare_trees which happens to be named exactly same as the existing function implementing it. This is confusing and the indirection is not necessary for our needs. Compiler is clever enough to call it directly so there's effectively no change. Reviewed-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index d9813a5b075a..7c7c09fc65e8 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -278,11 +278,6 @@ enum btrfs_compare_tree_result { BTRFS_COMPARE_TREE_CHANGED, BTRFS_COMPARE_TREE_SAME, }; -typedef int (*btrfs_changed_cb_t)(struct btrfs_path *left_path, - struct btrfs_path *right_path, - struct btrfs_key *key, - enum btrfs_compare_tree_result result, - void *ctx); __cold static void inconsistent_snapshot_error(struct send_ctx *sctx, @@ -6692,8 +6687,7 @@ static int tree_compare_item(struct btrfs_path *left_path, * If it detects a change, it aborts immediately. */ static int btrfs_compare_trees(struct btrfs_root *left_root, - struct btrfs_root *right_root, - btrfs_changed_cb_t changed_cb, void *ctx) + struct btrfs_root *right_root, void *ctx) { struct btrfs_fs_info *fs_info = left_root->fs_info; int ret; @@ -6960,8 +6954,7 @@ static int send_subvol(struct send_ctx *sctx) goto out; if (sctx->parent_root) { - ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, - changed_cb, sctx); + ret = btrfs_compare_trees(sctx->send_root, sctx->parent_root, sctx); if (ret < 0) goto out; ret = finish_inode_if_needed(sctx, 1); From 5522a27e59c62b3fb60448eb6c3640cf307247bb Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Aug 2020 12:43:37 +0100 Subject: [PATCH 142/271] btrfs: do not take the log_mutex of the subvolume when pinning the log During a rename we pin the log to make sure no one commits a log that reflects an ongoing rename operation, as it might result in a committed log where it recorded the unlink of the old name without having recorded the new name. However we are taking the subvolume's log_mutex before incrementing the log_writers counter, which is not necessary since that counter is atomic and we only remove the old name from the log and add the new name to the log after we have incremented log_writers, ensuring that no one can commit the log after we have removed the old name from the log and before we added the new name to the log. By taking the log_mutex lock we are just adding unnecessary contention on the lock, which can become visible for workloads that mix renames with fsyncs, writes for files opened with O_SYNC and unlink operations (if the inode or its parent were fsynced before in the current transaction). So just remove the lock and unlock of the subvolume's log_mutex at btrfs_pin_log_trans(). Using dbench, which mixes different types of operations that end up taking that mutex (fsyncs, renames, unlinks and writes into files opened with O_SYNC) revealed some small gains. The following script that calls dbench was used: #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/btrfs MOUNT_OPTIONS="-o ssd -o space_cache=v2" MKFS_OPTIONS="-m single -d single" THREADS=32 echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -s -t 600 -D $MNT $THREADS umount $MNT The test was run on bare metal, no virtualization, on a box with 12 cores (Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel configuration that is the default of typical distributions (debian in this case), without debug options enabled (kasan, kmemleak, slub debug, debug of page allocations, lock debugging, etc). Results before this patch: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 4410848 0.017 738.640 Close 3240222 0.001 0.834 Rename 186850 7.478 1272.476 Unlink 890875 0.128 785.018 Deltree 128 2.846 12.081 Mkdir 64 0.002 0.003 Qpathinfo 3997659 0.009 11.171 Qfileinfo 701307 0.001 0.478 Qfsinfo 733494 0.002 1.103 Sfileinfo 359362 0.004 3.266 Find 1546226 0.041 4.128 WriteX 2202803 7.905 1376.989 ReadX 6917775 0.003 3.887 LockX 14392 0.002 0.043 UnlockX 14392 0.001 0.085 Flush 309225 0.128 1033.936 Throughput 231.555 MB/sec (sync open) 32 clients 32 procs max_latency=1376.993 ms Results after this patch: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 4603244 0.017 232.776 Close 3381299 0.001 1.041 Rename 194871 7.251 1073.165 Unlink 929730 0.133 119.233 Deltree 128 2.871 10.199 Mkdir 64 0.002 0.004 Qpathinfo 4171343 0.009 11.317 Qfileinfo 731227 0.001 1.635 Qfsinfo 765079 0.002 3.568 Sfileinfo 374881 0.004 1.220 Find 1612964 0.041 4.675 WriteX 2296720 7.569 1178.204 ReadX 7213633 0.003 3.075 LockX 14976 0.002 0.076 UnlockX 14976 0.001 0.061 Flush 322635 0.102 579.505 Throughput 241.4 MB/sec (sync open) 32 clients 32 procs max_latency=1178.207 ms (+4.3% throughput, -14.4% max latency) Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index a6e69378b706..4c3565c282e2 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -215,9 +215,7 @@ static int join_running_log_trans(struct btrfs_root *root) */ void btrfs_pin_log_trans(struct btrfs_root *root) { - mutex_lock(&root->log_mutex); atomic_inc(&root->log_writers); - mutex_unlock(&root->log_mutex); } /* From 75b463d2b47aef96fe1dc3e0237629963034764b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Aug 2020 12:43:48 +0100 Subject: [PATCH 143/271] btrfs: do not commit logs and transactions during link and rename operations Since commit d4682ba03ef618 ("Btrfs: sync log after logging new name") we started to commit logs, and fallback to transaction commits when we failed to log the new names or commit the logs, after link and rename operations when the target inodes (or their parents) were previously logged in the current transaction. This was to avoid losing directories despite an explicit fsync on them when they are ancestors of some inode that got a new named logged, due to a link or rename operation. However that adds the cost of starting IO and waiting for it to complete, which can cause higher latencies for applications. Instead of doing that, just make sure that when we log a new name for an inode we don't mark any of its ancestors as logged, so that if any one does an fsync against any of them, without doing any other change on them, the fsync commits the log. This way we only pay the cost of a log commit (or a transaction commit if something goes wrong or a new block group was created) if the application explicitly asks to fsync any of the parent directories. Using dbench, which mixes several filesystems operations including renames, revealed some significant latency gains. The following script that uses dbench was used to test this: #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/btrfs MOUNT_OPTIONS="-o ssd -o space_cache=v2" MKFS_OPTIONS="-m single -d single" THREADS=16 echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT dbench -t 300 -D $MNT $THREADS umount $MNT The test was run on bare metal, no virtualization, on a box with 12 cores (Intel i7-8700), 64Gb of RAM and using a NVMe device, with a kernel configuration that is the default of typical distributions (debian in this case), without debug options enabled (kasan, kmemleak, slub debug, debug of page allocations, lock debugging, etc). Results before this patch: Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 10750455 0.011 155.088 Close 7896674 0.001 0.243 Rename 455222 2.158 1101.947 Unlink 2171189 0.067 121.638 Deltree 256 2.425 7.816 Mkdir 128 0.002 0.003 Qpathinfo 9744323 0.006 21.370 Qfileinfo 1707092 0.001 0.146 Qfsinfo 1786756 0.001 11.228 Sfileinfo 875612 0.003 21.263 Find 3767281 0.025 9.617 WriteX 5356924 0.011 211.390 ReadX 16852694 0.003 9.442 LockX 35008 0.002 0.119 UnlockX 35008 0.001 0.138 Flush 753458 4.252 1102.249 Throughput 1128.35 MB/sec 16 clients 16 procs max_latency=1102.255 ms Results after this patch: 16 clients, after Operation Count AvgLat MaxLat ---------------------------------------- NTCreateX 11471098 0.012 448.281 Close 8426396 0.001 0.925 Rename 485746 0.123 267.183 Unlink 2316477 0.080 63.433 Deltree 288 2.830 11.144 Mkdir 144 0.003 0.010 Qpathinfo 10397420 0.006 10.288 Qfileinfo 1822039 0.001 0.169 Qfsinfo 1906497 0.002 14.039 Sfileinfo 934433 0.004 2.438 Find 4019879 0.026 10.200 WriteX 5718932 0.011 200.985 ReadX 17981671 0.003 10.036 LockX 37352 0.002 0.076 UnlockX 37352 0.001 0.109 Flush 804018 5.015 778.033 Throughput 1201.98 MB/sec 16 clients 16 procs max_latency=778.036 ms (+6.5% throughput, -29.4% max latency, -75.8% rename latency) Test case generic/498 from fstests tests the scenario that the previously mentioned commit fixed. Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/inode.c | 115 +++++--------------------------------------- fs/btrfs/tree-log.c | 100 +++++++++++++++++--------------------- fs/btrfs/tree-log.h | 14 ++---- 3 files changed, 60 insertions(+), 169 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 50d9c335271a..e8a35db53369 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6373,7 +6373,6 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, drop_inode = 1; } else { struct dentry *parent = dentry->d_parent; - int ret; err = btrfs_update_inode(trans, root, inode); if (err) @@ -6388,12 +6387,7 @@ static int btrfs_link(struct dentry *old_dentry, struct inode *dir, goto fail; } d_instantiate(dentry, inode); - ret = btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent, - true, NULL); - if (ret == BTRFS_NEED_TRANS_COMMIT) { - err = btrfs_commit_transaction(trans); - trans = NULL; - } + btrfs_log_new_name(trans, BTRFS_I(inode), NULL, parent); } fail: @@ -8778,27 +8772,19 @@ static int btrfs_rename_exchange(struct inode *old_dir, struct inode *new_inode = new_dentry->d_inode; struct inode *old_inode = old_dentry->d_inode; struct timespec64 ctime = current_time(old_inode); - struct dentry *parent; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); u64 new_ino = btrfs_ino(BTRFS_I(new_inode)); u64 old_idx = 0; u64 new_idx = 0; int ret; + int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; - struct btrfs_log_ctx ctx_root; - struct btrfs_log_ctx ctx_dest; - bool sync_log_root = false; - bool sync_log_dest = false; - bool commit_transaction = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) return -EXDEV; - btrfs_init_log_ctx(&ctx_root, old_inode); - btrfs_init_log_ctx(&ctx_dest, new_inode); - /* close the race window with snapshot create/destroy ioctl */ if (old_ino == BTRFS_FIRST_FREE_OBJECTID || new_ino == BTRFS_FIRST_FREE_OBJECTID) @@ -8940,30 +8926,14 @@ static int btrfs_rename_exchange(struct inode *old_dir, BTRFS_I(new_inode)->dir_index = new_idx; if (root_log_pinned) { - parent = new_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx_root); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_root = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); root_log_pinned = false; } if (dest_log_pinned) { - if (!commit_transaction) { - parent = old_dentry->d_parent; - ret = btrfs_log_new_name(trans, BTRFS_I(new_inode), - BTRFS_I(new_dir), parent, - false, &ctx_dest); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log_dest = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; - } + btrfs_log_new_name(trans, BTRFS_I(new_inode), BTRFS_I(new_dir), + old_dentry->d_parent); btrfs_end_log_trans(dest); dest_log_pinned = false; } @@ -8996,46 +8966,13 @@ static int btrfs_rename_exchange(struct inode *old_dir, dest_log_pinned = false; } } - if (!ret && sync_log_root && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, - &ctx_root); - if (ret) - commit_transaction = true; - } - if (!ret && sync_log_dest && !commit_transaction) { - ret = btrfs_sync_log(trans, BTRFS_I(new_inode)->root, - &ctx_dest); - if (ret) - commit_transaction = true; - } - if (commit_transaction) { - /* - * We may have set commit_transaction when logging the new name - * in the destination root, in which case we left the source - * root context in the list of log contextes. So make sure we - * remove it to avoid invalid memory accesses, since the context - * was allocated in our stack frame. - */ - if (sync_log_root) { - mutex_lock(&root->log_mutex); - list_del_init(&ctx_root.list); - mutex_unlock(&root->log_mutex); - } - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (new_ino == BTRFS_FIRST_FREE_OBJECTID || old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); - ASSERT(list_empty(&ctx_root.list)); - ASSERT(list_empty(&ctx_dest.list)); - return ret; } @@ -9103,11 +9040,9 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, struct inode *old_inode = d_inode(old_dentry); u64 index = 0; int ret; + int ret2; u64 old_ino = btrfs_ino(BTRFS_I(old_inode)); bool log_pinned = false; - struct btrfs_log_ctx ctx; - bool sync_log = false; - bool commit_transaction = false; if (btrfs_ino(BTRFS_I(new_dir)) == BTRFS_EMPTY_SUBVOL_DIR_OBJECTID) return -EPERM; @@ -9257,17 +9192,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, BTRFS_I(old_inode)->dir_index = index; if (log_pinned) { - struct dentry *parent = new_dentry->d_parent; - - btrfs_init_log_ctx(&ctx, old_inode); - ret = btrfs_log_new_name(trans, BTRFS_I(old_inode), - BTRFS_I(old_dir), parent, - false, &ctx); - if (ret == BTRFS_NEED_LOG_SYNC) - sync_log = true; - else if (ret == BTRFS_NEED_TRANS_COMMIT) - commit_transaction = true; - ret = 0; + btrfs_log_new_name(trans, BTRFS_I(old_inode), BTRFS_I(old_dir), + new_dentry->d_parent); btrfs_end_log_trans(root); log_pinned = false; } @@ -9304,23 +9230,8 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, btrfs_end_log_trans(root); log_pinned = false; } - if (!ret && sync_log) { - ret = btrfs_sync_log(trans, BTRFS_I(old_inode)->root, &ctx); - if (ret) - commit_transaction = true; - } else if (sync_log) { - mutex_lock(&root->log_mutex); - list_del(&ctx.list); - mutex_unlock(&root->log_mutex); - } - if (commit_transaction) { - ret = btrfs_commit_transaction(trans); - } else { - int ret2; - - ret2 = btrfs_end_transaction(trans); - ret = ret ? ret : ret2; - } + ret2 = btrfs_end_transaction(trans); + ret = ret ? ret : ret2; out_notrans: if (old_ino == BTRFS_FIRST_FREE_OBJECTID) up_read(&fs_info->subvol_sem); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 4c3565c282e2..e86fe7bbba82 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -176,7 +176,7 @@ static int start_log_trans(struct btrfs_trans_handle *trans, atomic_inc(&root->log_batch); atomic_inc(&root->log_writers); - if (ctx) { + if (ctx && !ctx->logging_new_name) { int index = root->log_transid % 2; list_add_tail(&ctx->list, &root->log_ctxs[index]); ctx->log_transid = root->log_transid; @@ -5337,19 +5337,34 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } /* - * Don't update last_log_commit if we logged that an inode exists after - * it was loaded to memory (full_sync bit set). - * This is to prevent data loss when we do a write to the inode, then - * the inode gets evicted after all delalloc was flushed, then we log - * it exists (due to a rename for example) and then fsync it. This last - * fsync would do nothing (not logging the extents previously written). + * If we are logging that an ancestor inode exists as part of logging a + * new name from a link or rename operation, don't mark the inode as + * logged - otherwise if an explicit fsync is made against an ancestor, + * the fsync considers the inode in the log and doesn't sync the log, + * resulting in the ancestor missing after a power failure unless the + * log was synced as part of an fsync against any other unrelated inode. + * So keep it simple for this case and just don't flag the ancestors as + * logged. */ - spin_lock(&inode->lock); - inode->logged_trans = trans->transid; - if (inode_only != LOG_INODE_EXISTS || - !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) - inode->last_log_commit = inode->last_sub_trans; - spin_unlock(&inode->lock); + if (!ctx || + !(S_ISDIR(inode->vfs_inode.i_mode) && ctx->logging_new_name && + &inode->vfs_inode != ctx->inode)) { + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* + * Don't update last_log_commit if we logged that an inode exists + * after it was loaded to memory (full_sync bit set). + * This is to prevent data loss when we do a write to the inode, + * then the inode gets evicted after all delalloc was flushed, + * then we log it exists (due to a rename for example) and then + * fsync it. This last fsync would do nothing (not logging the + * extents previously written). + */ + if (inode_only != LOG_INODE_EXISTS || + !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } out_unlock: mutex_unlock(&inode->log_mutex); @@ -6369,26 +6384,13 @@ void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, /* * Call this after adding a new name for a file and it will properly * update the log to reflect the new name. - * - * @ctx can not be NULL when @sync_log is false, and should be NULL when it's - * true (because it's not used). - * - * Return value depends on whether @sync_log is true or false. - * When true: returns BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed by the caller, and BTRFS_DONT_NEED_TRANS_COMMIT - * otherwise. - * When false: returns BTRFS_DONT_NEED_LOG_SYNC if the caller does not need to - * sync the log, BTRFS_NEED_LOG_SYNC if it needs to sync the log, - * or BTRFS_NEED_TRANS_COMMIT if the transaction needs to be - * committed (without attempting to sync the log). */ -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx) + struct dentry *parent) { struct btrfs_fs_info *fs_info = trans->fs_info; - int ret; + struct btrfs_log_ctx ctx; /* * this will force the logging code to walk the dentry chain @@ -6403,34 +6405,18 @@ int btrfs_log_new_name(struct btrfs_trans_handle *trans, */ if (inode->logged_trans <= fs_info->last_trans_committed && (!old_dir || old_dir->logged_trans <= fs_info->last_trans_committed)) - return sync_log ? BTRFS_DONT_NEED_TRANS_COMMIT : - BTRFS_DONT_NEED_LOG_SYNC; + return; - if (sync_log) { - struct btrfs_log_ctx ctx2; - - btrfs_init_log_ctx(&ctx2, &inode->vfs_inode); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx2); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_TRANS_COMMIT; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - ret = btrfs_sync_log(trans, inode->root, &ctx2); - if (ret) - return BTRFS_NEED_TRANS_COMMIT; - return BTRFS_DONT_NEED_TRANS_COMMIT; - } - - ASSERT(ctx); - ret = btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, ctx); - if (ret == BTRFS_NO_LOG_SYNC) - return BTRFS_DONT_NEED_LOG_SYNC; - else if (ret) - return BTRFS_NEED_TRANS_COMMIT; - - return BTRFS_NEED_LOG_SYNC; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); + ctx.logging_new_name = true; + /* + * We don't care about the return value. If we fail to log the new name + * then we know the next attempt to sync the log will fallback to a full + * transaction commit (due to a call to btrfs_set_log_full_commit()), so + * we don't need to worry about getting a log committed that has an + * inconsistent state after a rename operation. + */ + btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, + LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index 132e43d29034..ddfc6789d9bf 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -16,6 +16,7 @@ struct btrfs_log_ctx { int log_ret; int log_transid; bool log_new_dentries; + bool logging_new_name; struct inode *inode; struct list_head list; }; @@ -26,6 +27,7 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->log_ret = 0; ctx->log_transid = 0; ctx->log_new_dentries = false; + ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); } @@ -67,16 +69,8 @@ void btrfs_record_unlink_dir(struct btrfs_trans_handle *trans, int for_rename); void btrfs_record_snapshot_destroy(struct btrfs_trans_handle *trans, struct btrfs_inode *dir); -/* Return values for btrfs_log_new_name() */ -enum { - BTRFS_DONT_NEED_TRANS_COMMIT, - BTRFS_NEED_TRANS_COMMIT, - BTRFS_DONT_NEED_LOG_SYNC, - BTRFS_NEED_LOG_SYNC, -}; -int btrfs_log_new_name(struct btrfs_trans_handle *trans, +void btrfs_log_new_name(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_inode *old_dir, - struct dentry *parent, - bool sync_log, struct btrfs_log_ctx *ctx); + struct dentry *parent); #endif From 487781796d302266aff993bee17d4e1ddd73445b Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 11 Aug 2020 12:43:58 +0100 Subject: [PATCH 144/271] btrfs: make fast fsyncs wait only for writeback Currently regardless of a full or a fast fsync we always wait for ordered extents to complete, and then start logging the inode after that. However for fast fsyncs we can just wait for the writeback to complete, we don't need to wait for the ordered extents to complete since we use the list of modified extents maps to figure out which extents we must log and we can get their checksums directly from the ordered extents that are still in flight, otherwise look them up from the checksums tree. Until commit b5e6c3e170b770 ("btrfs: always wait on ordered extents at fsync time"), for fast fsyncs, we used to start logging without even waiting for the writeback to complete first, we would wait for it to complete after logging, while holding a transaction open, which lead to performance issues when using cgroups and probably for other cases too, as wait for IO while holding a transaction handle should be avoided as much as possible. After that, for fast fsyncs, we started to wait for ordered extents to complete before starting to log, which adds some latency to fsyncs and we even got at least one report about a performance drop which bisected to that particular change: https://lore.kernel.org/linux-btrfs/20181109215148.GF23260@techsingularity.net/ This change makes fast fsyncs only wait for writeback to finish before starting to log the inode, instead of waiting for both the writeback to finish and for the ordered extents to complete. This brings back part of the logic we had that extracts checksums from in flight ordered extents, which are not yet in the checksums tree, and making sure transaction commits wait for the completion of ordered extents previously logged (by far most of the time they have already completed by the time a transaction commit starts, resulting in no wait at all), to avoid any data loss if an ordered extent completes after the transaction used to log an inode is committed, followed by a power failure. When there are no other tasks accessing the checksums and the subvolume btrees, the ordered extent completion is pretty fast, typically taking 100 to 200 microseconds only in my observations. However when there are other tasks accessing these btrees, ordered extent completion can take a lot more time due to lock contention on nodes and leaves of these btrees. I've seen cases over 2 milliseconds, which starts to be significant. In particular when we do have concurrent fsyncs against different files there is a lot of contention on the checksums btree, since we have many tasks writing the checksums into the btree and other tasks that already started the logging phase are doing lookups for checksums in the btree. This change also turns all ranged fsyncs into full ranged fsyncs, which is something we already did when not using the NO_HOLES features or when doing a full fsync. This is to guarantee we never miss checksums due to writeback having been triggered only for a part of an extent, and we end up logging the full extent but only checksums for the written range, which results in missing checksums after log replay. Allowing ranged fsyncs to operate again only in the original range, when using the NO_HOLES feature and doing a fast fsync is doable but requires some non trivial changes to the writeback path, which can always be worked on later if needed, but I don't think they are a very common use case. Several tests were performed using fio for different numbers of concurrent jobs, each writing and fsyncing its own file, for both sequential and random file writes. The tests were run on bare metal, no virtualization, on a box with 12 cores (Intel i7-8700), 64Gb of RAM and a NVMe device, with a kernel configuration that is the default of typical distributions (debian in this case), without debug options enabled (kasan, kmemleak, slub debug, debug of page allocations, lock debugging, etc). The following script that calls fio was used: $ cat test-fsync.sh #!/bin/bash DEV=/dev/nvme0n1 MNT=/mnt/btrfs MOUNT_OPTIONS="-o ssd -o space_cache=v2" MKFS_OPTIONS="-d single -m single" if [ $# -ne 5 ]; then echo "Use $0 NUM_JOBS FILE_SIZE FSYNC_FREQ BLOCK_SIZE [write|randwrite]" exit 1 fi NUM_JOBS=$1 FILE_SIZE=$2 FSYNC_FREQ=$3 BLOCK_SIZE=$4 WRITE_MODE=$5 if [ "$WRITE_MODE" != "write" ] && [ "$WRITE_MODE" != "randwrite" ]; then echo "Invalid WRITE_MODE, must be 'write' or 'randwrite'" exit 1 fi cat < /tmp/fio-job.ini [writers] rw=$WRITE_MODE fsync=$FSYNC_FREQ fallocate=none group_reporting=1 direct=0 bs=$BLOCK_SIZE ioengine=sync size=$FILE_SIZE directory=$MNT numjobs=$NUM_JOBS EOF echo "performance" | tee /sys/devices/system/cpu/cpu*/cpufreq/scaling_governor echo echo "Using config:" echo cat /tmp/fio-job.ini echo umount $MNT &> /dev/null mkfs.btrfs -f $MKFS_OPTIONS $DEV mount $MOUNT_OPTIONS $DEV $MNT fio /tmp/fio-job.ini umount $MNT The results were the following: ************************* *** sequential writes *** ************************* ==== 1 job, 8GiB file, fsync frequency 1, block size 64KiB ==== Before patch: WRITE: bw=36.6MiB/s (38.4MB/s), 36.6MiB/s-36.6MiB/s (38.4MB/s-38.4MB/s), io=8192MiB (8590MB), run=223689-223689msec After patch: WRITE: bw=40.2MiB/s (42.1MB/s), 40.2MiB/s-40.2MiB/s (42.1MB/s-42.1MB/s), io=8192MiB (8590MB), run=203980-203980msec (+9.8%, -8.8% runtime) ==== 2 jobs, 4GiB files, fsync frequency 1, block size 64KiB ==== Before patch: WRITE: bw=35.8MiB/s (37.5MB/s), 35.8MiB/s-35.8MiB/s (37.5MB/s-37.5MB/s), io=8192MiB (8590MB), run=228950-228950msec After patch: WRITE: bw=43.5MiB/s (45.6MB/s), 43.5MiB/s-43.5MiB/s (45.6MB/s-45.6MB/s), io=8192MiB (8590MB), run=188272-188272msec (+21.5% throughput, -17.8% runtime) ==== 4 jobs, 2GiB files, fsync frequency 1, block size 64KiB ==== Before patch: WRITE: bw=50.1MiB/s (52.6MB/s), 50.1MiB/s-50.1MiB/s (52.6MB/s-52.6MB/s), io=8192MiB (8590MB), run=163446-163446msec After patch: WRITE: bw=64.5MiB/s (67.6MB/s), 64.5MiB/s-64.5MiB/s (67.6MB/s-67.6MB/s), io=8192MiB (8590MB), run=126987-126987msec (+28.7% throughput, -22.3% runtime) ==== 8 jobs, 1GiB files, fsync frequency 1, block size 64KiB ==== Before patch: WRITE: bw=64.0MiB/s (68.1MB/s), 64.0MiB/s-64.0MiB/s (68.1MB/s-68.1MB/s), io=8192MiB (8590MB), run=126075-126075msec After patch: WRITE: bw=86.8MiB/s (91.0MB/s), 86.8MiB/s-86.8MiB/s (91.0MB/s-91.0MB/s), io=8192MiB (8590MB), run=94358-94358msec (+35.6% throughput, -25.2% runtime) ==== 16 jobs, 512MiB files, fsync frequency 1, block size 64KiB ==== Before patch: WRITE: bw=79.8MiB/s (83.6MB/s), 79.8MiB/s-79.8MiB/s (83.6MB/s-83.6MB/s), io=8192MiB (8590MB), run=102694-102694msec After patch: WRITE: bw=107MiB/s (112MB/s), 107MiB/s-107MiB/s (112MB/s-112MB/s), io=8192MiB (8590MB), run=76446-76446msec (+34.1% throughput, -25.6% runtime) ==== 32 jobs, 512MiB files, fsync frequency 1, block size 64KiB ==== Before patch: WRITE: bw=93.2MiB/s (97.7MB/s), 93.2MiB/s-93.2MiB/s (97.7MB/s-97.7MB/s), io=16.0GiB (17.2GB), run=175836-175836msec After patch: WRITE: bw=111MiB/s (117MB/s), 111MiB/s-111MiB/s (117MB/s-117MB/s), io=16.0GiB (17.2GB), run=147001-147001msec (+19.1% throughput, -16.4% runtime) ==== 64 jobs, 512MiB files, fsync frequency 1, block size 64KiB ==== Before patch: WRITE: bw=108MiB/s (114MB/s), 108MiB/s-108MiB/s (114MB/s-114MB/s), io=32.0GiB (34.4GB), run=302656-302656msec After patch: WRITE: bw=133MiB/s (140MB/s), 133MiB/s-133MiB/s (140MB/s-140MB/s), io=32.0GiB (34.4GB), run=246003-246003msec (+23.1% throughput, -18.7% runtime) ************************ *** random writes *** ************************ ==== 1 job, 8GiB file, fsync frequency 16, block size 4KiB ==== Before patch: WRITE: bw=11.5MiB/s (12.0MB/s), 11.5MiB/s-11.5MiB/s (12.0MB/s-12.0MB/s), io=8192MiB (8590MB), run=714281-714281msec After patch: WRITE: bw=11.6MiB/s (12.2MB/s), 11.6MiB/s-11.6MiB/s (12.2MB/s-12.2MB/s), io=8192MiB (8590MB), run=705959-705959msec (+0.9% throughput, -1.7% runtime) ==== 2 jobs, 4GiB files, fsync frequency 16, block size 4KiB ==== Before patch: WRITE: bw=12.8MiB/s (13.5MB/s), 12.8MiB/s-12.8MiB/s (13.5MB/s-13.5MB/s), io=8192MiB (8590MB), run=638101-638101msec After patch: WRITE: bw=13.1MiB/s (13.7MB/s), 13.1MiB/s-13.1MiB/s (13.7MB/s-13.7MB/s), io=8192MiB (8590MB), run=625374-625374msec (+2.3% throughput, -2.0% runtime) ==== 4 jobs, 2GiB files, fsync frequency 16, block size 4KiB ==== Before patch: WRITE: bw=15.4MiB/s (16.2MB/s), 15.4MiB/s-15.4MiB/s (16.2MB/s-16.2MB/s), io=8192MiB (8590MB), run=531146-531146msec After patch: WRITE: bw=17.8MiB/s (18.7MB/s), 17.8MiB/s-17.8MiB/s (18.7MB/s-18.7MB/s), io=8192MiB (8590MB), run=460431-460431msec (+15.6% throughput, -13.3% runtime) ==== 8 jobs, 1GiB files, fsync frequency 16, block size 4KiB ==== Before patch: WRITE: bw=19.9MiB/s (20.8MB/s), 19.9MiB/s-19.9MiB/s (20.8MB/s-20.8MB/s), io=8192MiB (8590MB), run=412664-412664msec After patch: WRITE: bw=22.2MiB/s (23.3MB/s), 22.2MiB/s-22.2MiB/s (23.3MB/s-23.3MB/s), io=8192MiB (8590MB), run=368589-368589msec (+11.6% throughput, -10.7% runtime) ==== 16 jobs, 512MiB files, fsync frequency 16, block size 4KiB ==== Before patch: WRITE: bw=29.3MiB/s (30.7MB/s), 29.3MiB/s-29.3MiB/s (30.7MB/s-30.7MB/s), io=8192MiB (8590MB), run=279924-279924msec After patch: WRITE: bw=30.4MiB/s (31.9MB/s), 30.4MiB/s-30.4MiB/s (31.9MB/s-31.9MB/s), io=8192MiB (8590MB), run=269258-269258msec (+3.8% throughput, -3.8% runtime) ==== 32 jobs, 512MiB files, fsync frequency 16, block size 4KiB ==== Before patch: WRITE: bw=36.9MiB/s (38.7MB/s), 36.9MiB/s-36.9MiB/s (38.7MB/s-38.7MB/s), io=16.0GiB (17.2GB), run=443581-443581msec After patch: WRITE: bw=41.6MiB/s (43.6MB/s), 41.6MiB/s-41.6MiB/s (43.6MB/s-43.6MB/s), io=16.0GiB (17.2GB), run=394114-394114msec (+12.7% throughput, -11.2% runtime) ==== 64 jobs, 512MiB files, fsync frequency 16, block size 4KiB ==== Before patch: WRITE: bw=45.9MiB/s (48.1MB/s), 45.9MiB/s-45.9MiB/s (48.1MB/s-48.1MB/s), io=32.0GiB (34.4GB), run=714614-714614msec After patch: WRITE: bw=48.8MiB/s (51.1MB/s), 48.8MiB/s-48.8MiB/s (51.1MB/s-51.1MB/s), io=32.0GiB (34.4GB), run=672087-672087msec (+6.3% throughput, -6.0% runtime) Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 5 ++ fs/btrfs/file.c | 97 ++++++++++++++-------- fs/btrfs/ordered-data.c | 59 ++++++++++++++ fs/btrfs/ordered-data.h | 11 +++ fs/btrfs/transaction.c | 10 +++ fs/btrfs/transaction.h | 7 ++ fs/btrfs/tree-log.c | 176 ++++++++++++++++++++++++---------------- fs/btrfs/tree-log.h | 18 +++- 8 files changed, 277 insertions(+), 106 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index c47b6c6fea9f..00f7831d0902 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -25,6 +25,11 @@ enum { BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, + /* + * Always set under the VFS' inode lock, otherwise it can cause races + * during fsync (we start as a fast fsync and then end up in a full + * fsync racing with ordered extent completion). + */ BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index de14ed402390..b859d5ae7f80 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2114,20 +2114,24 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) struct btrfs_trans_handle *trans; struct btrfs_log_ctx ctx; int ret = 0, err; + u64 len; + bool full_sync; trace_btrfs_sync_file(file, datasync); btrfs_init_log_ctx(&ctx, inode); /* - * Set the range to full if the NO_HOLES feature is not enabled. - * This is to avoid missing file extent items representing holes after - * replaying the log. + * Always set the range to a full range, otherwise we can get into + * several problems, from missing file extent items to represent holes + * when not using the NO_HOLES feature, to log tree corruption due to + * races between hole detection during logging and completion of ordered + * extents outside the range, to missing checksums due to ordered extents + * for which we flushed only a subset of their pages. */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES)) { - start = 0; - end = LLONG_MAX; - } + start = 0; + end = LLONG_MAX; + len = (u64)LLONG_MAX + 1; /* * We write the dirty pages in the range and wait until they complete @@ -2151,19 +2155,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); /* - * If the inode needs a full sync, make sure we use a full range to - * avoid log tree corruption, due to hole detection racing with ordered - * extent completion for adjacent ranges and races between logging and - * completion of ordered extents for adjancent ranges - both races - * could lead to file extent items in the log with overlapping ranges. - * Do this while holding the inode lock, to avoid races with other - * tasks. + * Always check for the full sync flag while holding the inode's lock, + * to avoid races with other tasks. The flag must be either set all the + * time during logging or always off all the time while logging. */ - if (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags)) { - start = 0; - end = LLONG_MAX; - } + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); /* * Before we acquired the inode's lock, someone may have dirtied more @@ -2194,20 +2191,42 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * We have to do this here to avoid the priority inversion of waiting on * IO of a lower priority task while holding a transaction open. * - * Also, the range length can be represented by u64, we have to do the - * typecasts to avoid signed overflow if it's [0, LLONG_MAX]. + * For a full fsync we wait for the ordered extents to complete while + * for a fast fsync we wait just for writeback to complete, and then + * attach the ordered extents to the transaction so that a transaction + * commit waits for their completion, to avoid data loss if we fsync, + * the current transaction commits before the ordered extents complete + * and a power failure happens right after that. */ - ret = btrfs_wait_ordered_range(inode, start, (u64)end - (u64)start + 1); - if (ret) { - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + if (full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + } else { + /* + * Get our ordered extents as soon as possible to avoid doing + * checksum lookups in the csum tree, and use instead the + * checksums attached to the ordered extents. + */ + btrfs_get_ordered_extents_for_logging(BTRFS_I(inode), + &ctx.ordered_extents); + ret = filemap_fdatawait_range(inode->i_mapping, start, end); } + + if (ret) + goto out_release_extents; + atomic_inc(&root->log_batch); + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ smp_mb(); if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed) { + (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && + (full_sync || list_empty(&ctx.ordered_extents)))) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2223,9 +2242,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * checked called fsync. */ ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } /* @@ -2242,12 +2259,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - up_write(&BTRFS_I(inode)->dio_sem); - inode_unlock(inode); - goto out; + goto out_release_extents; } - ret = btrfs_log_dentry_safe(trans, dentry, start, end, &ctx); + ret = btrfs_log_dentry_safe(trans, dentry, &ctx); + btrfs_release_log_ctx_extents(&ctx); if (ret < 0) { /* Fallthrough and commit/free transaction. */ ret = 1; @@ -2274,6 +2290,13 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) goto out; } } + if (!full_sync) { + ret = btrfs_wait_ordered_range(inode, start, len); + if (ret) { + btrfs_end_transaction(trans); + goto out; + } + } ret = btrfs_commit_transaction(trans); } else { ret = btrfs_end_transaction(trans); @@ -2284,6 +2307,12 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (!ret) ret = err; return ret > 0 ? -EIO : ret; + +out_release_extents: + btrfs_release_log_ctx_extents(&ctx); + up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; } static const struct vm_operations_struct btrfs_file_vm_ops = { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index ebac13389e7e..4732c5b89460 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -212,6 +212,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset refcount_set(&entry->refs, 1); init_waitqueue_head(&entry->wait); INIT_LIST_HEAD(&entry->list); + INIT_LIST_HEAD(&entry->log_list); INIT_LIST_HEAD(&entry->root_extent_list); INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); @@ -445,6 +446,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); + ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); if (entry->inode) btrfs_add_delayed_iput(entry->inode); @@ -470,6 +472,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; struct rb_node *node; + bool pending; /* This is paired with btrfs_add_ordered_extent. */ spin_lock(&btrfs_inode->lock); @@ -491,8 +494,36 @@ void btrfs_remove_ordered_extent(struct inode *inode, if (tree->last == node) tree->last = NULL; set_bit(BTRFS_ORDERED_COMPLETE, &entry->flags); + pending = test_and_clear_bit(BTRFS_ORDERED_PENDING, &entry->flags); spin_unlock_irq(&tree->lock); + /* + * The current running transaction is waiting on us, we need to let it + * know that we're complete and wake it up. + */ + if (pending) { + struct btrfs_transaction *trans; + + /* + * The checks for trans are just a formality, it should be set, + * but if it isn't we don't want to deref/assert under the spin + * lock, so be nice and check if trans is set, but ASSERT() so + * if it isn't set a developer will notice. + */ + spin_lock(&fs_info->trans_lock); + trans = fs_info->running_transaction; + if (trans) + refcount_inc(&trans->use_count); + spin_unlock(&fs_info->trans_lock); + + ASSERT(trans); + if (trans) { + if (atomic_dec_and_test(&trans->pending_ordered)) + wake_up(&trans->pending_wait); + btrfs_put_transaction(trans); + } + } + spin_lock(&root->ordered_extent_lock); list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; @@ -774,6 +805,34 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( return entry; } +/* + * Adds all ordered extents to the given list. The list ends up sorted by the + * file_offset of the ordered extents. + */ +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list) +{ + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; + struct rb_node *n; + + ASSERT(inode_is_locked(&inode->vfs_inode)); + + spin_lock_irq(&tree->lock); + for (n = rb_first(&tree->tree); n; n = rb_next(n)) { + struct btrfs_ordered_extent *ordered; + + ordered = rb_entry(n, struct btrfs_ordered_extent, rb_node); + + if (test_bit(BTRFS_ORDERED_LOGGED, &ordered->flags)) + continue; + + ASSERT(list_empty(&ordered->log_list)); + list_add_tail(&ordered->log_list, list); + refcount_inc(&ordered->refs); + } + spin_unlock_irq(&tree->lock); +} + /* * lookup and return any extent before 'file_offset'. NULL is returned * if none is found diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index d61ea9c880a3..644258a7dfb1 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -56,6 +56,12 @@ enum { BTRFS_ORDERED_TRUNCATED, /* Regular IO for COW */ BTRFS_ORDERED_REGULAR, + /* Used during fsync to track already logged extents */ + BTRFS_ORDERED_LOGGED, + /* We have already logged all the csums of the ordered extent */ + BTRFS_ORDERED_LOGGED_CSUM, + /* We wait for this extent to complete in the current transaction */ + BTRFS_ORDERED_PENDING, }; struct btrfs_ordered_extent { @@ -104,6 +110,9 @@ struct btrfs_ordered_extent { /* list of checksums for insertion when the extent io is done */ struct list_head list; + /* used for fast fsyncs */ + struct list_head log_list; + /* used to wait for the BTRFS_ORDERED_COMPLETE bit */ wait_queue_head_t wait; @@ -174,6 +183,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, u64 len); +void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, + struct list_head *list); int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index d2fc292ac61b..f8265dbec9c3 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -292,6 +292,8 @@ static noinline int join_transaction(struct btrfs_fs_info *fs_info, } cur_trans->fs_info = fs_info; + atomic_set(&cur_trans->pending_ordered, 0); + init_waitqueue_head(&cur_trans->pending_wait); atomic_set(&cur_trans->num_writers, 1); extwriter_counter_init(cur_trans, type); init_waitqueue_head(&cur_trans->writer_wait); @@ -2165,6 +2167,14 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) btrfs_wait_delalloc_flush(trans); + /* + * Wait for all ordered extents started by a fast fsync that joined this + * transaction. Otherwise if this transaction commits before the ordered + * extents complete we lose logged data after a power failure. + */ + wait_event(cur_trans->pending_wait, + atomic_read(&cur_trans->pending_ordered) == 0); + btrfs_scrub_pause(fs_info); /* * Ok now we need to make sure to block out any other joins while we diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index d60b055b8695..8241c050ba71 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -85,6 +85,13 @@ struct btrfs_transaction { spinlock_t dropped_roots_lock; struct btrfs_delayed_ref_root delayed_refs; struct btrfs_fs_info *fs_info; + + /* + * Number of ordered extents the transaction must wait for before + * committing. These are ordered extents started by a fast fsync. + */ + atomic_t pending_ordered; + wait_queue_head_t pending_wait; }; #define __TRANS_FREEZABLE (1U << 0) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e86fe7bbba82..e35cfe8cf68b 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -96,8 +96,6 @@ enum { static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); static int link_to_fixup_dir(struct btrfs_trans_handle *trans, struct btrfs_root *root, @@ -4080,10 +4078,14 @@ static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) static int log_extent_csums(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct btrfs_root *log_root, - const struct extent_map *em) + const struct extent_map *em, + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; u64 csum_offset; u64 csum_len; + u64 mod_start = em->mod_start; + u64 mod_len = em->mod_len; LIST_HEAD(ordered_sums); int ret = 0; @@ -4092,13 +4094,71 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, em->block_start == EXTENT_MAP_HOLE) return 0; + list_for_each_entry(ordered, &ctx->ordered_extents, log_list) { + const u64 ordered_end = ordered->file_offset + ordered->num_bytes; + const u64 mod_end = mod_start + mod_len; + struct btrfs_ordered_sum *sums; + + if (mod_len == 0) + break; + + if (ordered_end <= mod_start) + continue; + if (mod_end <= ordered->file_offset) + break; + + /* + * We are going to copy all the csums on this ordered extent, so + * go ahead and adjust mod_start and mod_len in case this ordered + * extent has already been logged. + */ + if (ordered->file_offset > mod_start) { + if (ordered_end >= mod_end) + mod_len = ordered->file_offset - mod_start; + /* + * If we have this case + * + * |--------- logged extent ---------| + * |----- ordered extent ----| + * + * Just don't mess with mod_start and mod_len, we'll + * just end up logging more csums than we need and it + * will be ok. + */ + } else { + if (ordered_end < mod_end) { + mod_len = mod_end - ordered_end; + mod_start = ordered_end; + } else { + mod_len = 0; + } + } + + /* + * To keep us from looping for the above case of an ordered + * extent that falls inside of the logged extent. + */ + if (test_and_set_bit(BTRFS_ORDERED_LOGGED_CSUM, &ordered->flags)) + continue; + + list_for_each_entry(sums, &ordered->list, list) { + ret = log_csums(trans, inode, log_root, sums); + if (ret) + return ret; + } + } + + /* We're done, found all csums in the ordered extents. */ + if (mod_len == 0) + return 0; + /* If we're compressed we have to save the entire range of csums. */ if (em->compress_type) { csum_offset = 0; csum_len = max(em->block_len, em->orig_block_len); } else { - csum_offset = em->mod_start - em->start; - csum_len = em->mod_len; + csum_offset = mod_start - em->start; + csum_len = mod_len; } /* block start is already adjusted for the file extent offset. */ @@ -4138,7 +4198,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, int ret; int extent_inserted = 0; - ret = log_extent_csums(trans, inode, log, em); + ret = log_extent_csums(trans, inode, log, em, ctx); if (ret) return ret; @@ -4340,10 +4400,10 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_path *path, - struct btrfs_log_ctx *ctx, - const u64 start, - const u64 end) + struct btrfs_log_ctx *ctx) { + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; struct extent_map *em, *n; struct list_head extents; struct extent_map_tree *tree = &inode->extent_tree; @@ -4357,23 +4417,6 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, test_gen = root->fs_info->last_trans_committed; list_for_each_entry_safe(em, n, &tree->modified_extents, list) { - /* - * Skip extents outside our logging range. It's important to do - * it for correctness because if we don't ignore them, we may - * log them before their ordered extent completes, and therefore - * we could log them without logging their respective checksums - * (the checksum items are added to the csum tree at the very - * end of btrfs_finish_ordered_io()). Also leave such extents - * outside of our range in the list, since we may have another - * ranged fsync in the near future that needs them. If an extent - * outside our range corresponds to a hole, log it to avoid - * leaving gaps between extents (fsck will complain when we are - * not using the NO_HOLES feature). - */ - if ((em->start > end || em->start + em->len <= start) && - em->block_start != EXTENT_MAP_HOLE) - continue; - list_del_init(&em->list); /* * Just an arbitrary number, this can be really CPU intensive @@ -4432,8 +4475,32 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (!ret) ret = btrfs_log_prealloc_extents(trans, inode, path); + if (ret) + return ret; - return ret; + /* + * We have logged all extents successfully, now make sure the commit of + * the current transaction waits for the ordered extents to complete + * before it commits and wipes out the log trees, otherwise we would + * lose data if an ordered extents completes after the transaction + * commits and a power failure happens after the transaction commit. + */ + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + set_bit(BTRFS_ORDERED_LOGGED, &ordered->flags); + + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + spin_lock_irq(&inode->ordered_tree.lock); + if (!test_bit(BTRFS_ORDERED_COMPLETE, &ordered->flags)) { + set_bit(BTRFS_ORDERED_PENDING, &ordered->flags); + atomic_inc(&trans->transaction->pending_ordered); + } + spin_unlock_irq(&inode->ordered_tree.lock); + } + btrfs_put_ordered_extent(ordered); + } + + return 0; } static int logged_inode_size(struct btrfs_root *log, struct btrfs_inode *inode, @@ -4839,7 +4906,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, ret = btrfs_log_inode(trans, root, BTRFS_I(inode), LOG_OTHER_INODE_ALL, - 0, LLONG_MAX, ctx); + ctx); btrfs_add_delayed_iput(inode); } } @@ -4897,7 +4964,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * log with the new name before we unpin it. */ ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_OTHER_INODE, 0, LLONG_MAX, ctx); + LOG_OTHER_INODE, ctx); if (ret) { btrfs_add_delayed_iput(inode); continue; @@ -5110,8 +5177,6 @@ static int copy_inode_items_to_log(struct btrfs_trans_handle *trans, static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, int inode_only, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct btrfs_path *path; @@ -5290,7 +5355,7 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, } if (fast_search) { ret = btrfs_log_changed_extents(trans, root, inode, dst_path, - ctx, start, end); + ctx); if (ret) { err = ret; goto out_unlock; @@ -5299,31 +5364,8 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, struct extent_map *em, *n; write_lock(&em_tree->lock); - /* - * We can't just remove every em if we're called for a ranged - * fsync - that is, one that doesn't cover the whole possible - * file range (0 to LLONG_MAX). This is because we can have - * em's that fall outside the range we're logging and therefore - * their ordered operations haven't completed yet - * (btrfs_finish_ordered_io() not invoked yet). This means we - * didn't get their respective file extent item in the fs/subvol - * tree yet, and need to let the next fast fsync (one which - * consults the list of modified extent maps) find the em so - * that it logs a matching file extent item and waits for the - * respective ordered operation to complete (if it's still - * running). - * - * Removing every em outside the range we're logging would make - * the next fast fsync not log their matching file extent items, - * therefore making us lose data after a log replay. - */ - list_for_each_entry_safe(em, n, &em_tree->modified_extents, - list) { - const u64 mod_end = em->mod_start + em->mod_len - 1; - - if (em->mod_start >= start && mod_end <= end) - list_del_init(&em->list); - } + list_for_each_entry_safe(em, n, &em_tree->modified_extents, list) + list_del_init(&em->list); write_unlock(&em_tree->lock); } @@ -5604,7 +5646,7 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK) log_mode = LOG_INODE_ALL; ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode), - log_mode, 0, LLONG_MAX, ctx); + log_mode, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(di_inode))) ret = 1; @@ -5748,7 +5790,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, if (ctx) ctx->log_new_dentries = false; ret = btrfs_log_inode(trans, root, BTRFS_I(dir_inode), - LOG_INODE_ALL, 0, LLONG_MAX, ctx); + LOG_INODE_ALL, ctx); if (!ret && btrfs_must_commit_transaction(trans, BTRFS_I(dir_inode))) ret = 1; @@ -5799,8 +5841,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (BTRFS_I(inode)->generation > last_committed) ret = btrfs_log_inode(trans, root, BTRFS_I(inode), - LOG_INODE_EXISTS, - 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -5855,7 +5896,7 @@ static int log_new_ancestors_fast(struct btrfs_trans_handle *trans, if (inode->generation > fs_info->last_trans_committed) { ret = btrfs_log_inode(trans, root, inode, - LOG_INODE_EXISTS, 0, LLONG_MAX, ctx); + LOG_INODE_EXISTS, ctx); if (ret) break; } @@ -5963,8 +6004,6 @@ static int log_all_new_ancestors(struct btrfs_trans_handle *trans, static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, struct dentry *parent, - const loff_t start, - const loff_t end, int inode_only, struct btrfs_log_ctx *ctx) { @@ -6017,7 +6056,7 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (ret) goto end_no_trans; - ret = btrfs_log_inode(trans, root, inode, inode_only, start, end, ctx); + ret = btrfs_log_inode(trans, root, inode, inode_only, ctx); if (ret) goto end_trans; @@ -6113,15 +6152,13 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, */ int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx) { struct dentry *parent = dget_parent(dentry); int ret; ret = btrfs_log_inode_parent(trans, BTRFS_I(d_inode(dentry)), parent, - start, end, LOG_INODE_ALL, ctx); + LOG_INODE_ALL, ctx); dput(parent); return ret; @@ -6416,7 +6453,6 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, * we don't need to worry about getting a log committed that has an * inconsistent state after a rename operation. */ - btrfs_log_inode_parent(trans, inode, parent, 0, LLONG_MAX, - LOG_INODE_EXISTS, &ctx); + btrfs_log_inode_parent(trans, inode, parent, LOG_INODE_EXISTS, &ctx); } diff --git a/fs/btrfs/tree-log.h b/fs/btrfs/tree-log.h index ddfc6789d9bf..731bd9c029f5 100644 --- a/fs/btrfs/tree-log.h +++ b/fs/btrfs/tree-log.h @@ -19,6 +19,8 @@ struct btrfs_log_ctx { bool logging_new_name; struct inode *inode; struct list_head list; + /* Only used for fast fsyncs. */ + struct list_head ordered_extents; }; static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, @@ -30,6 +32,20 @@ static inline void btrfs_init_log_ctx(struct btrfs_log_ctx *ctx, ctx->logging_new_name = false; ctx->inode = inode; INIT_LIST_HEAD(&ctx->list); + INIT_LIST_HEAD(&ctx->ordered_extents); +} + +static inline void btrfs_release_log_ctx_extents(struct btrfs_log_ctx *ctx) +{ + struct btrfs_ordered_extent *ordered; + struct btrfs_ordered_extent *tmp; + + ASSERT(inode_is_locked(ctx->inode)); + + list_for_each_entry_safe(ordered, tmp, &ctx->ordered_extents, log_list) { + list_del_init(&ordered->log_list); + btrfs_put_ordered_extent(ordered); + } } static inline void btrfs_set_log_full_commit(struct btrfs_trans_handle *trans) @@ -51,8 +67,6 @@ int btrfs_free_log_root_tree(struct btrfs_trans_handle *trans, int btrfs_recover_log_trees(struct btrfs_root *tree_root); int btrfs_log_dentry_safe(struct btrfs_trans_handle *trans, struct dentry *dentry, - const loff_t start, - const loff_t end, struct btrfs_log_ctx *ctx); int btrfs_del_dir_entries_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *root, From 24646481fb19a3bd86933ec30480454381ff9860 Mon Sep 17 00:00:00 2001 From: Leon Romanovsky Date: Wed, 19 Aug 2020 17:16:28 +0300 Subject: [PATCH 145/271] btrfs: sysfs: fix unused-but-set-variable warnings The compilation with W=1 generates the following warnings: fs/btrfs/sysfs.c:1630:6: warning: variable 'ret' set but not used [-Wunused-but-set-variable] 1630 | int ret; | ^~~ fs/btrfs/sysfs.c:1629:6: warning: variable 'features' set but not used [-Wunused-but-set-variable] 1629 | u64 features; | ^~~~~~~~ [ The unused variables are leftover from e410e34fad91 ("Revert "btrfs: synchronize incompat feature bits with sysfs files""), which needs to be properly fixed by moving feature bit manipulation from the sysfs context. Silence the warning to save pepople time, we got several reports. ] Signed-off-by: Leon Romanovsky Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index fcc8757b4e93..81496ac43d09 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1630,12 +1630,16 @@ void btrfs_sysfs_feature_update(struct btrfs_fs_info *fs_info, { struct btrfs_fs_devices *fs_devs; struct kobject *fsid_kobj; - u64 features; - int ret; + u64 __maybe_unused features; + int __maybe_unused ret; if (!fs_info) return; + /* + * See 14e46e04958df74 and e410e34fad913dd, feature bit updates are not + * safe when called from some contexts (eg. balance) + */ features = get_features(fs_info, set); ASSERT(bit & supported_feature_masks[set]); From 4c448ce8b48fb65536c903f1ed8a80554838508e Mon Sep 17 00:00:00 2001 From: Marcos Paulo de Souza Date: Mon, 17 Aug 2020 10:56:10 -0300 Subject: [PATCH 146/271] btrfs: make read_block_group_item return void Since it's inclusion on 9afc66498a0b ("btrfs: block-group: refactor how we read one block group item") this function always returned 0, so there is no need to check for the returned value. Reviewed-by: Nikolay Borisov Signed-off-by: Marcos Paulo de Souza Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c9d7465501a4..01e8ba1da1d3 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1873,7 +1873,7 @@ static int check_chunk_block_group_mappings(struct btrfs_fs_info *fs_info) return ret; } -static int read_block_group_item(struct btrfs_block_group *cache, +static void read_block_group_item(struct btrfs_block_group *cache, struct btrfs_path *path, const struct btrfs_key *key) { @@ -1887,8 +1887,6 @@ static int read_block_group_item(struct btrfs_block_group *cache, sizeof(bgi)); cache->used = btrfs_stack_block_group_used(&bgi); cache->flags = btrfs_stack_block_group_flags(&bgi); - - return 0; } static int read_one_block_group(struct btrfs_fs_info *info, @@ -1907,9 +1905,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, if (!cache) return -ENOMEM; - ret = read_block_group_item(cache, path, key); - if (ret < 0) - goto error; + read_block_group_item(cache, path, key); set_free_space_tree_thresholds(cache); From 154f7cb86809a3a796bffbc7a5a7ce0dee585eaa Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Thu, 20 Aug 2020 15:42:46 +0800 Subject: [PATCH 147/271] btrfs: add owner and fs_info to alloc_state io_tree Commit 1c11b63eff2a ("btrfs: replace pending/pinned chunks lists with io tree") introduced btrfs_device::alloc_state extent io tree, but it doesn't initialize the fs_info and owner member. This means the following features are not properly supported: - Fs owner report for insert_state() error Without fs_info initialized, although btrfs_err() won't panic, it won't output which fs is causing the error. - Wrong owner for trace events alloc_state will get the owner as pinned extents. Fix this by assiging proper fs_info and owner for btrfs_device::alloc_state. Fixes: 1c11b63eff2a ("btrfs: replace pending/pinned chunks lists with io tree") Reviewed-by: Nikolay Borisov Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent-io-tree.h | 1 + fs/btrfs/volumes.c | 7 ++++--- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 219a09a2b734..250b8cbaaf97 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -48,6 +48,7 @@ enum { IO_TREE_INODE_FILE_EXTENT, IO_TREE_LOG_CSUM_RANGE, IO_TREE_SELFTEST, + IO_TREE_DEVICE_ALLOC_STATE, }; struct extent_io_tree { diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5d73c287314a..a028a64b4add 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -406,7 +406,7 @@ void __exit btrfs_cleanup_fs_uuids(void) * Returned struct is not linked onto any lists and must be destroyed using * btrfs_free_device. */ -static struct btrfs_device *__alloc_device(void) +static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info) { struct btrfs_device *dev; @@ -433,7 +433,8 @@ static struct btrfs_device *__alloc_device(void) btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); - extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL); + extent_io_tree_init(fs_info, &dev->alloc_state, + IO_TREE_DEVICE_ALLOC_STATE, NULL); return dev; } @@ -6532,7 +6533,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, if (WARN_ON(!devid && !fs_info)) return ERR_PTR(-EINVAL); - dev = __alloc_device(); + dev = __alloc_device(fs_info); if (IS_ERR(dev)) return dev; From f85781fb505ec02891734e77f7c69a9c85c99ec3 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Mon, 17 Aug 2020 11:18:21 -0500 Subject: [PATCH 148/271] btrfs: switch to iomap for direct IO We're using direct io implementation based on buffer heads. This patch switches to the new iomap infrastructure. Switch from __blockdev_direct_IO() to iomap_dio_rw(). Rename btrfs_get_blocks_direct() to btrfs_dio_iomap_begin() and use it as iomap_begin() for iomap direct I/O functions. This function allocates and locks all the blocks required for the I/O. btrfs_submit_direct() is used as the submit_io() hook for direct I/O ops. Since we need direct I/O reads to go through iomap_dio_rw(), we change file_operations.read_iter() to a btrfs_file_read_iter() which calls btrfs_direct_IO() for direct reads and falls back to generic_file_buffered_read() for incomplete reads and buffered reads. We don't need address_space.direct_IO() anymore: set it to noop. Similarly, we don't need flags used in __blockdev_direct_IO(). iomap is capable of direct I/O reads from a hole, so we don't need to return -ENOENT. Btrfs direct I/O is now done under i_rwsem, shared in case of reads and exclusive in case of writes. This guards against simultaneous truncates. Use iomap->iomap_end() to check for failed or incomplete direct I/O: - for writes, call __endio_write_update_ordered() - for reads, unlock extents btrfs_dio_data is now hooked in iomap->private and not current->journal_info. It carries the reservation variable and the amount of data submitted, so we can calculate the amount of data to call __endio_write_update_ordered in case of an error. This patch removes last use of struct buffer_head from btrfs. Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/Kconfig | 1 + fs/btrfs/ctree.h | 1 + fs/btrfs/file.c | 21 ++- fs/btrfs/inode.c | 325 ++++++++++++++++++++++------------------------- 4 files changed, 176 insertions(+), 172 deletions(-) diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index 575636f6491e..68b95ad82126 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -14,6 +14,7 @@ config BTRFS_FS select LZO_DECOMPRESS select ZSTD_COMPRESS select ZSTD_DECOMPRESS + select FS_IOMAP select RAID6_PQ select XOR_BLOCKS select SRCU diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f2d4c14fc273..eb7adc069926 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3020,6 +3020,7 @@ int btrfs_writepage_cow_fixup(struct page *page, u64 start, u64 end); void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate); extern const struct dentry_operations btrfs_dentry_operations; +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter); /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b859d5ae7f80..b62679382799 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1870,7 +1870,7 @@ static ssize_t __btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from) loff_t endbyte; int err; - written = generic_file_direct_write(iocb, from); + written = btrfs_direct_IO(iocb, from); if (written < 0 || !iov_iter_count(from)) return written; @@ -3568,9 +3568,26 @@ static int btrfs_file_open(struct inode *inode, struct file *filp) return generic_file_open(inode, filp); } +static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + ssize_t ret = 0; + + if (iocb->ki_flags & IOCB_DIRECT) { + struct inode *inode = file_inode(iocb->ki_filp); + + inode_lock_shared(inode); + ret = btrfs_direct_IO(iocb, to); + inode_unlock_shared(inode); + if (ret < 0) + return ret; + } + + return generic_file_buffered_read(iocb, to, ret); +} + const struct file_operations btrfs_file_operations = { .llseek = btrfs_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = btrfs_file_read_iter, .splice_read = generic_file_splice_read, .write_iter = btrfs_file_write_iter, .splice_write = iter_file_splice_write, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index e8a35db53369..4f0b1dbd3240 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6,7 +6,6 @@ #include #include #include -#include #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include "misc.h" #include "ctree.h" @@ -59,9 +59,9 @@ struct btrfs_iget_args { struct btrfs_dio_data { u64 reserve; - u64 unsubmitted_oe_range_start; - u64 unsubmitted_oe_range_end; - int overwrite; + loff_t length; + ssize_t submitted; + struct extent_changeset *data_reserved; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -7103,7 +7103,7 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, } static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, - struct extent_state **cached_state, int writing) + struct extent_state **cached_state, bool writing) { struct btrfs_ordered_extent *ordered; int ret = 0; @@ -7241,30 +7241,7 @@ static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start, } -static int btrfs_get_blocks_direct_read(struct extent_map *em, - struct buffer_head *bh_result, - struct inode *inode, - u64 start, u64 len) -{ - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - - if (em->block_start == EXTENT_MAP_HOLE || - test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - return -ENOENT; - - len = min(len, em->len - (start - em->start)); - - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - return 0; -} - static int btrfs_get_blocks_direct_write(struct extent_map **map, - struct buffer_head *bh_result, struct inode *inode, struct btrfs_dio_data *dio_data, u64 start, u64 len) @@ -7325,7 +7302,6 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, } /* this will cow the extent */ - len = bh_result->b_size; free_extent_map(em); *map = em = btrfs_new_extent_direct(BTRFS_I(inode), start, len); if (IS_ERR(em)) { @@ -7336,64 +7312,76 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, len = min(len, em->len - (start - em->start)); skip_cow: - bh_result->b_blocknr = (em->block_start + (start - em->start)) >> - inode->i_blkbits; - bh_result->b_size = len; - bh_result->b_bdev = fs_info->fs_devices->latest_bdev; - set_buffer_mapped(bh_result); - - if (!test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) - set_buffer_new(bh_result); - /* * Need to update the i_size under the extent lock so buffered * readers will get the updated i_size when we unlock. */ - if (!dio_data->overwrite && start + len > i_size_read(inode)) + if (start + len > i_size_read(inode)) i_size_write(inode, start + len); - WARN_ON(dio_data->reserve < len); dio_data->reserve -= len; - dio_data->unsubmitted_oe_range_end = start + len; - current->journal_info = dio_data; out: return ret; } -static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, + loff_t length, unsigned int flags, struct iomap *iomap, + struct iomap *srcmap) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct extent_map *em; struct extent_state *cached_state = NULL; struct btrfs_dio_data *dio_data = NULL; - u64 start = iblock << inode->i_blkbits; u64 lockstart, lockend; - u64 len = bh_result->b_size; + const bool write = !!(flags & IOMAP_WRITE); int ret = 0; + u64 len = length; + bool unlock_extents = false; - if (!create) + if (!write) len = min_t(u64, len, fs_info->sectorsize); lockstart = start; lockend = start + len - 1; - if (current->journal_info) { - /* - * Need to pull our outstanding extents and set journal_info to NULL so - * that anything that needs to check if there's a transaction doesn't get - * confused. - */ - dio_data = current->journal_info; - current->journal_info = NULL; + /* + * The generic stuff only does filemap_write_and_wait_range, which + * isn't enough if we've written compressed pages to this area, so we + * need to flush the dirty pages again to make absolutely sure that any + * outstanding dirty pages are on disk. + */ + if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, + &BTRFS_I(inode)->runtime_flags)) { + ret = filemap_fdatawrite_range(inode->i_mapping, start, + start + length - 1); + if (ret) + return ret; } + dio_data = kzalloc(sizeof(*dio_data), GFP_NOFS); + if (!dio_data) + return -ENOMEM; + + dio_data->length = length; + if (write) { + dio_data->reserve = round_up(length, fs_info->sectorsize); + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), + &dio_data->data_reserved, + start, dio_data->reserve); + if (ret) { + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + return ret; + } + } + iomap->private = dio_data; + + /* * If this errors out it's because we couldn't invalidate pagecache for * this range and we need to fallback to buffered. */ - if (lock_extent_direct(inode, lockstart, lockend, &cached_state, - create)) { + if (lock_extent_direct(inode, lockstart, lockend, &cached_state, write)) { ret = -ENOTBLK; goto err; } @@ -7425,36 +7413,48 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, goto unlock_err; } - if (create) { - ret = btrfs_get_blocks_direct_write(&em, bh_result, inode, - dio_data, start, len); + len = min(len, em->len - (start - em->start)); + if (write) { + ret = btrfs_get_blocks_direct_write(&em, inode, dio_data, + start, len); if (ret < 0) goto unlock_err; - - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + unlock_extents = true; + /* Recalc len in case the new em is smaller than requested */ + len = min(len, em->len - (start - em->start)); } else { - ret = btrfs_get_blocks_direct_read(em, bh_result, inode, - start, len); - /* Can be negative only if we read from a hole */ - if (ret < 0) { - ret = 0; - free_extent_map(em); - goto unlock_err; - } /* * We need to unlock only the end area that we aren't using. * The rest is going to be unlocked by the endio routine. */ - lockstart = start + bh_result->b_size; - if (lockstart < lockend) { - unlock_extent_cached(&BTRFS_I(inode)->io_tree, - lockstart, lockend, &cached_state); - } else { - free_extent_state(cached_state); - } + lockstart = start + len; + if (lockstart < lockend) + unlock_extents = true; } + if (unlock_extents) + unlock_extent_cached(&BTRFS_I(inode)->io_tree, + lockstart, lockend, &cached_state); + else + free_extent_state(cached_state); + + /* + * Translate extent map information to iomap. + * We trim the extents (and move the addr) even though iomap code does + * that, since we have locked only the parts we are performing I/O in. + */ + if ((em->block_start == EXTENT_MAP_HOLE) || + (test_bit(EXTENT_FLAG_PREALLOC, &em->flags) && !write)) { + iomap->addr = IOMAP_NULL_ADDR; + iomap->type = IOMAP_HOLE; + } else { + iomap->addr = em->block_start + (start - em->start); + iomap->type = IOMAP_MAPPED; + } + iomap->offset = start; + iomap->bdev = fs_info->fs_devices->latest_bdev; + iomap->length = len; + free_extent_map(em); return 0; @@ -7463,8 +7463,55 @@ static int btrfs_get_blocks_direct(struct inode *inode, sector_t iblock, unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, &cached_state); err: - if (dio_data) - current->journal_info = dio_data; + if (dio_data) { + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, start, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->reserve); + extent_changeset_free(dio_data->data_reserved); + kfree(dio_data); + } + return ret; +} + +static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, + ssize_t written, unsigned int flags, struct iomap *iomap) +{ + int ret = 0; + struct btrfs_dio_data *dio_data = iomap->private; + size_t submitted = dio_data->submitted; + const bool write = !!(flags & IOMAP_WRITE); + + if (!write && (iomap->type == IOMAP_HOLE)) { + /* If reading from a hole, unlock and return */ + unlock_extent(&BTRFS_I(inode)->io_tree, pos, pos + length - 1); + goto out; + } + + if (submitted < length) { + pos += submitted; + length -= submitted; + if (write) + __endio_write_update_ordered(BTRFS_I(inode), pos, + length, false); + else + unlock_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1); + ret = -ENOTBLK; + } + + if (write) { + if (dio_data->reserve) + btrfs_delalloc_release_space(BTRFS_I(inode), + dio_data->data_reserved, pos, + dio_data->reserve, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), dio_data->length); + extent_changeset_free(dio_data->data_reserved); + } +out: + kfree(dio_data); + iomap->private = NULL; + return ret; } @@ -7488,7 +7535,7 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip) dip->logical_offset + dip->bytes - 1); } - dio_end_io(dip->dio_bio); + bio_endio(dip->dio_bio); kfree(dip); } @@ -7722,24 +7769,11 @@ static struct btrfs_dio_private *btrfs_create_dio_private(struct bio *dio_bio, dip->disk_bytenr = (u64)dio_bio->bi_iter.bi_sector << 9; dip->dio_bio = dio_bio; refcount_set(&dip->refs, 1); - - if (write) { - struct btrfs_dio_data *dio_data = current->journal_info; - - /* - * Setting range start and end to the same value means that - * no cleanup will happen in btrfs_direct_IO - */ - dio_data->unsubmitted_oe_range_end = dip->logical_offset + - dip->bytes; - dio_data->unsubmitted_oe_range_start = - dio_data->unsubmitted_oe_range_end; - } return dip; } -static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, - loff_t file_offset) +static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, + struct bio *dio_bio, loff_t file_offset) { const bool write = (bio_op(dio_bio) == REQ_OP_WRITE); const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM); @@ -7756,6 +7790,7 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, int ret; blk_status_t status; struct btrfs_io_geometry geom; + struct btrfs_dio_data *dio_data = iomap->private; dip = btrfs_create_dio_private(dio_bio, inode, file_offset); if (!dip) { @@ -7764,8 +7799,8 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, file_offset + dio_bio->bi_iter.bi_size - 1); } dio_bio->bi_status = BLK_STS_RESOURCE; - dio_end_io(dio_bio); - return; + bio_endio(dio_bio); + return BLK_QC_T_NONE; } if (!write && csum) { @@ -7836,15 +7871,17 @@ static void btrfs_submit_direct(struct bio *dio_bio, struct inode *inode, goto out_err; } + dio_data->submitted += clone_len; clone_offset += clone_len; start_sector += clone_len >> 9; file_offset += clone_len; } while (submit_len > 0); - return; + return BLK_QC_T_NONE; out_err: dip->dio_bio->bi_status = status; btrfs_dio_private_put(dip); + return BLK_QC_T_NONE; } static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, @@ -7880,37 +7917,30 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, return retval; } -static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) +static const struct iomap_ops btrfs_dio_iomap_ops = { + .iomap_begin = btrfs_dio_iomap_begin, + .iomap_end = btrfs_dio_iomap_end, +}; + +static const struct iomap_dio_ops btrfs_dio_ops = { + .submit_io = btrfs_submit_direct, +}; + +ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - struct btrfs_dio_data dio_data = { 0 }; struct extent_changeset *data_reserved = NULL; loff_t offset = iocb->ki_pos; size_t count = 0; - int flags = 0; - bool wakeup = true; bool relock = false; ssize_t ret; if (check_direct_IO(fs_info, iter, offset)) return 0; - inode_dio_begin(inode); - - /* - * The generic stuff only does filemap_write_and_wait_range, which - * isn't enough if we've written compressed pages to this area, so - * we need to flush the dirty pages again to make absolutely sure - * that any outstanding dirty pages are on disk. - */ count = iov_iter_count(iter); - if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_fdatawrite_range(inode->i_mapping, offset, - offset + count - 1); - if (iov_iter_rw(iter) == WRITE) { /* * If the write DIO is beyond the EOF, we need update @@ -7918,66 +7948,21 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - dio_data.overwrite = 1; inode_unlock(inode); relock = true; } - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - offset, count); - if (ret) - goto out; - - /* - * We need to know how many extents we reserved so that we can - * do the accounting properly if we go over the number we - * originally calculated. Abuse current->journal_info for this. - */ - dio_data.reserve = round_up(count, - fs_info->sectorsize); - dio_data.unsubmitted_oe_range_start = (u64)offset; - dio_data.unsubmitted_oe_range_end = (u64)offset; - current->journal_info = &dio_data; down_read(&BTRFS_I(inode)->dio_sem); - } else if (test_bit(BTRFS_INODE_READDIO_NEED_LOCK, - &BTRFS_I(inode)->runtime_flags)) { - inode_dio_end(inode); - flags = DIO_LOCKING | DIO_SKIP_HOLES; - wakeup = false; } - ret = __blockdev_direct_IO(iocb, inode, - fs_info->fs_devices->latest_bdev, - iter, btrfs_get_blocks_direct, NULL, - btrfs_submit_direct, flags); - if (iov_iter_rw(iter) == WRITE) { + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, + is_sync_kiocb(iocb)); + + if (ret == -ENOTBLK) + ret = 0; + + if (iov_iter_rw(iter) == WRITE) up_read(&BTRFS_I(inode)->dio_sem); - current->journal_info = NULL; - if (ret < 0 && ret != -EIOCBQUEUED) { - if (dio_data.reserve) - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, offset, dio_data.reserve, - true); - /* - * On error we might have left some ordered extents - * without submitting corresponding bios for them, so - * cleanup them up to avoid other tasks getting them - * and waiting for them to complete forever. - */ - if (dio_data.unsubmitted_oe_range_start < - dio_data.unsubmitted_oe_range_end) - __endio_write_update_ordered(BTRFS_I(inode), - dio_data.unsubmitted_oe_range_start, - dio_data.unsubmitted_oe_range_end - - dio_data.unsubmitted_oe_range_start, - false); - } else if (ret >= 0 && (size_t)ret < count) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - offset, count - (size_t)ret, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), count); - } -out: - if (wakeup) - inode_dio_end(inode); + if (relock) inode_lock(inode); @@ -10209,7 +10194,7 @@ static const struct address_space_operations btrfs_aops = { .writepage = btrfs_writepage, .writepages = btrfs_writepages, .readahead = btrfs_readahead, - .direct_IO = btrfs_direct_IO, + .direct_IO = noop_direct_IO, .invalidatepage = btrfs_invalidatepage, .releasepage = btrfs_releasepage, #ifdef CONFIG_MIGRATION From 0eb79294dbe328debae67961df28113141825d7b Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Sep 2020 11:16:51 -0400 Subject: [PATCH 149/271] btrfs: dio iomap DSYNC workaround iomap dio will run generic_write_sync() for us if the iocb is DSYNC. This is problematic for us because of 2 reasons: 1. we hold the inode_lock() during this operation, and we take it in generic_write_sync() 2. we hold a read lock on the dio_sem but take the write lock in fsync Since we don't want to rip out this code right now, but reworking the locking is a bit much to do at this point, work around this problem with this masterpiece of a patch. First, we clear DSYNC on the iocb so that the iomap stuff doesn't know that it needs to handle the sync. We save this fact in current->journal_info, because we need to see do special things once we're in iomap_begin, and we have no way to pass private information into iomap_dio_rw(). Next we specify a separate iomap_dio_ops for sync, which implements an ->end_io() callback that gets called when the dio completes. This is important for AIO, because we really do need to run generic_write_sync() if we complete asynchronously. However if we're still in the submitting context when we enter ->end_io() we clear the flag so that the submitter knows they're the ones that needs to run generic_write_sync(). This is meant to be temporary. We need to work out how to eliminate the inode_lock() and the dio_sem in our fsync and use another mechanism to protect these operations. Tested-by: Johannes Thumshirn Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 33 ++++++++++++++++++++++ fs/btrfs/inode.c | 62 ++++++++++++++++++++++++++++++++++++++++-- fs/btrfs/transaction.h | 1 + 3 files changed, 94 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index b62679382799..4bf15846efdc 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2023,7 +2023,40 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, atomic_inc(&BTRFS_I(inode)->sync_writers); if (iocb->ki_flags & IOCB_DIRECT) { + /* + * 1. We must always clear IOCB_DSYNC in order to not deadlock + * in iomap, as it calls generic_write_sync() in this case. + * 2. If we are async, we can call iomap_dio_complete() either + * in + * + * 2.1. A worker thread from the last bio completed. In this + * case we need to mark the btrfs_dio_data that it is + * async in order to call generic_write_sync() properly. + * This is handled by setting BTRFS_DIO_SYNC_STUB in the + * current->journal_info. + * 2.2 The submitter context, because all IO completed + * before we exited iomap_dio_rw(). In this case we can + * just re-set the IOCB_DSYNC on the iocb and we'll do + * the sync below. If our ->end_io() gets called and + * current->journal_info is set, then we know we're in + * our current context and we will clear + * current->journal_info to indicate that we need to + * sync below. + */ + if (sync) { + ASSERT(current->journal_info == NULL); + iocb->ki_flags &= ~IOCB_DSYNC; + current->journal_info = BTRFS_DIO_SYNC_STUB; + } num_written = __btrfs_direct_write(iocb, from); + + /* + * As stated above, we cleared journal_info, so we need to do + * the sync ourselves. + */ + if (sync && current->journal_info == NULL) + iocb->ki_flags |= IOCB_DSYNC; + current->journal_info = NULL; } else { num_written = btrfs_buffered_write(iocb, from); if (num_written > 0) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4f0b1dbd3240..366d5b768f86 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -62,6 +62,7 @@ struct btrfs_dio_data { loff_t length; ssize_t submitted; struct extent_changeset *data_reserved; + bool sync; }; static const struct inode_operations btrfs_dir_inode_operations; @@ -7337,6 +7338,17 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, int ret = 0; u64 len = length; bool unlock_extents = false; + bool sync = (current->journal_info == BTRFS_DIO_SYNC_STUB); + + /* + * We used current->journal_info here to see if we were sync, but + * there's a lot of tests in the enospc machinery to not do flushing if + * we have a journal_info set, so we need to clear this out and re-set + * it in iomap_end. + */ + ASSERT(current->journal_info == NULL || + current->journal_info == BTRFS_DIO_SYNC_STUB); + current->journal_info = NULL; if (!write) len = min_t(u64, len, fs_info->sectorsize); @@ -7362,6 +7374,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (!dio_data) return -ENOMEM; + dio_data->sync = sync; dio_data->length = length; if (write) { dio_data->reserve = round_up(length, fs_info->sectorsize); @@ -7509,6 +7522,14 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, extent_changeset_free(dio_data->data_reserved); } out: + /* + * We're all done, we can re-set the current->journal_info now safely + * for our endio. + */ + if (dio_data->sync) { + ASSERT(current->journal_info == NULL); + current->journal_info = BTRFS_DIO_SYNC_STUB; + } kfree(dio_data); iomap->private = NULL; @@ -7917,6 +7938,30 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info, return retval; } +static inline int btrfs_maybe_fsync_end_io(struct kiocb *iocb, ssize_t size, + int error, unsigned flags) +{ + /* + * Now if we're still in the context of our submitter we know we can't + * safely run generic_write_sync(), so clear our flag here so that the + * caller knows to follow up with a sync. + */ + if (current->journal_info == BTRFS_DIO_SYNC_STUB) { + current->journal_info = NULL; + return error; + } + + if (error) + return error; + + if (size) { + iocb->ki_flags |= IOCB_DSYNC; + return generic_write_sync(iocb, size); + } + + return 0; +} + static const struct iomap_ops btrfs_dio_iomap_ops = { .iomap_begin = btrfs_dio_iomap_begin, .iomap_end = btrfs_dio_iomap_end, @@ -7926,6 +7971,11 @@ static const struct iomap_dio_ops btrfs_dio_ops = { .submit_io = btrfs_submit_direct, }; +static const struct iomap_dio_ops btrfs_sync_dops = { + .submit_io = btrfs_submit_direct, + .end_io = btrfs_maybe_fsync_end_io, +}; + ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) { struct file *file = iocb->ki_filp; @@ -7954,8 +8004,16 @@ ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) down_read(&BTRFS_I(inode)->dio_sem); } - ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops, - is_sync_kiocb(iocb)); + /* + * We have are actually a sync iocb, so we need our fancy endio to know + * if we need to sync. + */ + if (current->journal_info) + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_sync_dops, is_sync_kiocb(iocb)); + else + ret = iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, + &btrfs_dio_ops, is_sync_kiocb(iocb)); if (ret == -ENOTBLK) ret = 0; diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 8241c050ba71..858d9153a1cd 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -112,6 +112,7 @@ struct btrfs_transaction { #define TRANS_EXTWRITERS (__TRANS_START | __TRANS_ATTACH) #define BTRFS_SEND_TRANS_STUB ((void *)1) +#define BTRFS_DIO_SYNC_STUB ((void *)2) struct btrfs_trans_handle { u64 transid; From 1028d1c48b95c37336bed24a1dc1594e70aa67c6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 3 Aug 2020 12:58:46 +0300 Subject: [PATCH 150/271] btrfs: remove err variable from btrfs_get_extent There's no practical reason too use 'err' as a variable to convey errors. In fact it's value is either set explicitly in the beginning of the function or it simply takes the value of 'ret'. Not conforming to the usual pattern of having ret be the only variable used to convey errors makes the code more error prone to bugs. In fact one such bug was introduced by 6bf9e4bd6a27 ("btrfs: inode: Verify inode mode toi avoid NULL pointer dereference") by assigning the error value to 'ret' and not 'err'. Let's fix that issue and make the function less tricky by leaving only ret to convey error values. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 366d5b768f86..a50a40f8bef2 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6534,8 +6534,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, u64 start, u64 len) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - int ret; - int err = 0; + int ret = 0; u64 extent_start = 0; u64 extent_end = 0; u64 objectid = btrfs_ino(inode); @@ -6563,7 +6562,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, } em = alloc_extent_map(); if (!em) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } em->start = EXTENT_MAP_HOLE; @@ -6573,7 +6572,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path = btrfs_alloc_path(); if (!path) { - err = -ENOMEM; + ret = -ENOMEM; goto out; } @@ -6588,12 +6587,12 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { - err = ret; goto out; } else if (ret > 0) { if (path->slots[0] == 0) goto not_found; path->slots[0]--; + ret = 0; } leaf = path->nodes[0]; @@ -6619,7 +6618,7 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, extent_type == BTRFS_FILE_EXTENT_PREALLOC) { /* Only regular file could have regular/prealloc extent */ if (!S_ISREG(inode->vfs_inode.i_mode)) { - err = -EUCLEAN; + ret = -EUCLEAN; btrfs_crit(fs_info, "regular/prealloc extent found for non-regular inode %llu", btrfs_ino(inode)); @@ -6637,12 +6636,11 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, path->slots[0]++; if (path->slots[0] >= btrfs_header_nritems(leaf)) { ret = btrfs_next_leaf(root, path); - if (ret < 0) { - err = ret; + if (ret < 0) goto out; - } else if (ret > 0) { + else if (ret > 0) goto not_found; - } + leaf = path->nodes[0]; } btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); @@ -6693,10 +6691,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, BTRFS_COMPRESS_NONE) { ret = uncompress_inline(path, page, pg_offset, extent_offset, item); - if (ret) { - err = ret; + if (ret) goto out; - } } else { map = kmap(page); read_extent_buffer(leaf, map + pg_offset, ptr, @@ -6720,27 +6716,27 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, em->len = len; em->block_start = EXTENT_MAP_HOLE; insert: + ret = 0; btrfs_release_path(path); if (em->start > start || extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); - err = -EIO; + ret = -EIO; goto out; } - err = 0; write_lock(&em_tree->lock); - err = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); + ret = btrfs_add_extent_mapping(fs_info, em_tree, &em, start, len); write_unlock(&em_tree->lock); out: btrfs_free_path(path); trace_btrfs_get_extent(root, inode, em); - if (err) { + if (ret) { free_extent_map(em); - return ERR_PTR(err); + return ERR_PTR(ret); } return em; } From dc0ab488d2cb514e08276cbbc846b2ebb6056c2a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Jul 2020 13:48:46 +0300 Subject: [PATCH 151/271] btrfs: factor out reada loop in __reada_start_machine This is in preparation for moving fs_devices to proper lists. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/reada.c | 20 +++++++++++++++----- 1 file changed, 15 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index 243a2e44526e..a6bfe7ec14cb 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -767,15 +767,13 @@ static void reada_start_machine_worker(struct btrfs_work *work) kfree(rmw); } -static void __reada_start_machine(struct btrfs_fs_info *fs_info) +/* Try to start up to 10k READA requests for a group of devices */ +static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) { - struct btrfs_device *device; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; u64 enqueued; u64 total = 0; - int i; + struct btrfs_device *device; -again: do { enqueued = 0; mutex_lock(&fs_devices->device_list_mutex); @@ -787,6 +785,18 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); + + return total; +} + +static void __reada_start_machine(struct btrfs_fs_info *fs_info) +{ + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + int i; + u64 enqueued = 0; + +again: + enqueued += reada_start_for_fsdevs(fs_devices); if (fs_devices->seed) { fs_devices = fs_devices->seed; goto again; From 3712ccb7f1cccd9371efdaaac4775aa79bdf5f40 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 16 Jul 2020 10:17:04 +0300 Subject: [PATCH 152/271] btrfs: factor out loop logic from btrfs_free_extra_devids This prepares the code to switching seeds devices to a proper list. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 34 ++++++++++++++++++++-------------- 1 file changed, 20 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index a028a64b4add..309b575613fe 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1036,28 +1036,21 @@ static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig) return ERR_PTR(ret); } -/* - * After we have read the system tree and know devids belonging to - * this filesystem, remove the device which does not belong there. - */ -void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, + int step, struct btrfs_device **latest_dev) { struct btrfs_device *device, *next; - struct btrfs_device *latest_dev = NULL; - mutex_lock(&uuid_mutex); -again: /* This is the initialized path, it is safe to release the devices. */ list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) { - if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, - &device->dev_state)) { + if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) { if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, - &device->dev_state) && + &device->dev_state) && !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state) && - (!latest_dev || - device->generation > latest_dev->generation)) { - latest_dev = device; + (!*latest_dev || + device->generation > (*latest_dev)->generation)) { + *latest_dev = device; } continue; } @@ -1095,6 +1088,19 @@ void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) btrfs_free_device(device); } +} + +/* + * After we have read the system tree and know devids belonging to this + * filesystem, remove the device which does not belong there. + */ +void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) +{ + struct btrfs_device *latest_dev = NULL; + + mutex_lock(&uuid_mutex); +again: + __btrfs_free_extra_devids(fs_devices, step, &latest_dev); if (fs_devices->seed) { fs_devices = fs_devices->seed; goto again; From 54eed6ae8d8e9df81202da64f0817d8e40d5db0e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Jul 2020 13:48:48 +0300 Subject: [PATCH 153/271] btrfs: make close_fs_devices return void The return value of this function conveys absolutely no information. All callers already check the state of fs_devices->opened to decide how to proceed. So convert the function to returning void. While at it make btrfs_close_devices also return void. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 12 ++++-------- fs/btrfs/volumes.h | 2 +- 2 files changed, 5 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 309b575613fe..e94a0eeeeb52 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1156,12 +1156,12 @@ static void btrfs_close_one_device(struct btrfs_device *device) ASSERT(atomic_read(&device->reada_in_flight) == 0); } -static int close_fs_devices(struct btrfs_fs_devices *fs_devices) +static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; if (--fs_devices->opened > 0) - return 0; + return; mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { @@ -1173,17 +1173,14 @@ static int close_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; - - return 0; } -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_fs_devices *seed_devices = NULL; - int ret; mutex_lock(&uuid_mutex); - ret = close_fs_devices(fs_devices); + close_fs_devices(fs_devices); if (!fs_devices->opened) { seed_devices = fs_devices->seed; fs_devices->seed = NULL; @@ -1196,7 +1193,6 @@ int btrfs_close_devices(struct btrfs_fs_devices *fs_devices) close_fs_devices(fs_devices); free_fs_devices(fs_devices); } - return ret; } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 302c9234f7d0..d0e93e01d60b 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -435,7 +435,7 @@ int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags, void *holder); int btrfs_forget_devices(const char *path); -int btrfs_close_devices(struct btrfs_fs_devices *fs_devices); +void btrfs_close_devices(struct btrfs_fs_devices *fs_devices); void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step); void btrfs_assign_next_active_device(struct btrfs_device *device, struct btrfs_device *this_dev); From c4989c2fd0eba6e164e9a29c4a865e57dd644451 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 15 Jul 2020 13:48:49 +0300 Subject: [PATCH 154/271] btrfs: simplify setting/clearing fs_info to btrfs_fs_devices It makes no sense to have sysfs-related routines be responsible for properly initialising the fs_info pointer of struct btrfs_fs_device. Instead this can be streamlined by making it the responsibility of btrfs_init_devices_late to initialize it. That function already initializes fs_info of every individual device in btrfs_fs_devices. As far as clearing it is concerned it makes sense to move it to close_fs_devices. That function is only called when struct btrfs_fs_devices is no longer in use - either for holding seeds or main devices for a mounted filesystem. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 4 ---- fs/btrfs/volumes.c | 20 ++------------------ fs/btrfs/volumes.h | 2 -- 3 files changed, 2 insertions(+), 24 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 81496ac43d09..438a367371c4 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -939,8 +939,6 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; - btrfs_reset_fs_info_ptr(fs_info); - sysfs_remove_link(fsid_kobj, "bdi"); if (fs_info->space_info_kobj) { @@ -1404,8 +1402,6 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - btrfs_set_fs_info_ptr(fs_info); - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); if (error) return error; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index e94a0eeeeb52..91a413b73b64 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1173,6 +1173,7 @@ static void close_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->rw_devices); fs_devices->opened = 0; fs_devices->seeding = false; + fs_devices->fs_info = NULL; } void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) @@ -7201,6 +7202,7 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) device->fs_info = fs_info; mutex_unlock(&fs_devices->device_list_mutex); + fs_devices->fs_info = fs_info; fs_devices = fs_devices->seed; } } @@ -7499,24 +7501,6 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) mutex_unlock(&trans->fs_info->chunk_mutex); } -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; - } -} - -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info) -{ - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - while (fs_devices) { - fs_devices->fs_info = NULL; - fs_devices = fs_devices->seed; - } -} - /* * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10. */ diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d0e93e01d60b..524aa5eb5667 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -569,8 +569,6 @@ static inline enum btrfs_raid_types btrfs_bg_flags_to_raid_index(u64 flags) void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); -void btrfs_set_fs_info_ptr(struct btrfs_fs_info *fs_info); -void btrfs_reset_fs_info_ptr(struct btrfs_fs_info *fs_info); bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info, struct btrfs_device *failing_dev); void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, From 944d3f9fac61e24e13a056b25974df3831994f29 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 16 Jul 2020 10:25:33 +0300 Subject: [PATCH 155/271] btrfs: switch seed device to list api While this patch touches a bunch of files the conversion is straighforward. Instead of using the implicit linked list anchored at btrfs_fs_devices::seed the code is switched to using list_for_each_entry. Previous patches in the series already factored out code that processed both main and seed devices so in those cases the factored out functions are called on the main fs_devices and then on every seed dev inside list_for_each_entry. Using list api also allows to simplify deletion from the seed dev list performed in btrfs_rm_device and btrfs_rm_dev_replace_free_srcdev by substituting a while() loop with a simple list_del_init. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 39 ++++++++--------- fs/btrfs/reada.c | 9 ++-- fs/btrfs/volumes.c | 105 +++++++++++++++++++++------------------------ fs/btrfs/volumes.h | 2 +- 4 files changed, 71 insertions(+), 84 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index a496219baa39..84799ef76758 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -545,33 +545,30 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct page *page) static int check_tree_block_fsid(struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; u8 fsid[BTRFS_FSID_SIZE]; - int ret = 1; + u8 *metadata_uuid; read_extent_buffer(eb, fsid, offsetof(struct btrfs_header, fsid), BTRFS_FSID_SIZE); - while (fs_devices) { - u8 *metadata_uuid; + /* + * Checking the incompat flag is only valid for the current fs. For + * seed devices it's forbidden to have their uuid changed so reading + * ->fsid in this case is fine + */ + if (btrfs_fs_incompat(fs_info, METADATA_UUID)) + metadata_uuid = fs_devices->metadata_uuid; + else + metadata_uuid = fs_devices->fsid; - /* - * Checking the incompat flag is only valid for the current - * fs. For seed devices it's forbidden to have their uuid - * changed so reading ->fsid in this case is fine - */ - if (fs_devices == fs_info->fs_devices && - btrfs_fs_incompat(fs_info, METADATA_UUID)) - metadata_uuid = fs_devices->metadata_uuid; - else - metadata_uuid = fs_devices->fsid; + if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) + return 0; - if (!memcmp(fsid, metadata_uuid, BTRFS_FSID_SIZE)) { - ret = 0; - break; - } - fs_devices = fs_devices->seed; - } - return ret; + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + if (!memcmp(fsid, seed_devs->fsid, BTRFS_FSID_SIZE)) + return 0; + + return 1; } static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index a6bfe7ec14cb..e20972230823 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -791,16 +791,13 @@ static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) static void __reada_start_machine(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; int i; u64 enqueued = 0; -again: enqueued += reada_start_for_fsdevs(fs_devices); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) + enqueued += reada_start_for_fsdevs(seed_devs); if (enqueued == 0) return; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 91a413b73b64..2d7c845b8e67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -356,6 +356,7 @@ static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid, INIT_LIST_HEAD(&fs_devs->devices); INIT_LIST_HEAD(&fs_devs->alloc_list); INIT_LIST_HEAD(&fs_devs->fs_list); + INIT_LIST_HEAD(&fs_devs->seed_list); if (fsid) memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE); @@ -1097,14 +1098,13 @@ static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step) { struct btrfs_device *latest_dev = NULL; + struct btrfs_fs_devices *seed_dev; mutex_lock(&uuid_mutex); -again: __btrfs_free_extra_devids(fs_devices, step, &latest_dev); - if (fs_devices->seed) { - fs_devices = fs_devices->seed; - goto again; - } + + list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list) + __btrfs_free_extra_devids(seed_dev, step, &latest_dev); fs_devices->latest_bdev = latest_dev->bdev; @@ -1178,20 +1178,18 @@ static void close_fs_devices(struct btrfs_fs_devices *fs_devices) void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) { - struct btrfs_fs_devices *seed_devices = NULL; + LIST_HEAD(list); + struct btrfs_fs_devices *tmp; mutex_lock(&uuid_mutex); close_fs_devices(fs_devices); - if (!fs_devices->opened) { - seed_devices = fs_devices->seed; - fs_devices->seed = NULL; - } + if (!fs_devices->opened) + list_splice_init(&fs_devices->seed_list, &list); mutex_unlock(&uuid_mutex); - while (seed_devices) { - fs_devices = seed_devices; - seed_devices = fs_devices->seed; + list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); + list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } } @@ -2169,14 +2167,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, btrfs_free_device(device); if (cur_devices->open_devices == 0) { - while (fs_devices) { - if (fs_devices->seed == cur_devices) { - fs_devices->seed = cur_devices->seed; - break; - } - fs_devices = fs_devices->seed; - } - cur_devices->seed = NULL; + list_del_init(&cur_devices->seed_list); close_fs_devices(cur_devices); free_fs_devices(cur_devices); } @@ -2236,8 +2227,6 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) /* if this is no devs we rather delete the fs_devices */ if (!fs_devices->num_devices) { - struct btrfs_fs_devices *tmp_fs_devices; - /* * On a mounted FS, num_devices can't be zero unless it's a * seed. In case of a seed device being replaced, the replace @@ -2246,15 +2235,7 @@ void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) */ ASSERT(fs_devices->seeding); - tmp_fs_devices = fs_info->fs_devices; - while (tmp_fs_devices) { - if (tmp_fs_devices->seed == fs_devices) { - tmp_fs_devices->seed = fs_devices->seed; - break; - } - tmp_fs_devices = tmp_fs_devices->seed; - } - fs_devices->seed = NULL; + list_del_init(&fs_devices->seed_list); close_fs_devices(fs_devices); free_fs_devices(fs_devices); } @@ -2409,7 +2390,7 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) fs_devices->open_devices = 0; fs_devices->missing_devices = 0; fs_devices->rotating = false; - fs_devices->seed = seed_devices; + list_add(&seed_devices->seed_list, &fs_devices->seed_list); generate_random_uuid(fs_devices->fsid); memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE); @@ -6465,11 +6446,21 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, bool seed) { struct btrfs_device *device; + struct btrfs_fs_devices *seed_devs; - while (fs_devices) { + if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &fs_devices->devices, dev_list) { + if (device->devid == devid && + (!uuid || memcmp(device->uuid, uuid, + BTRFS_UUID_SIZE) == 0)) + return device; + } + } + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { if (!fsid || - !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { - list_for_each_entry(device, &fs_devices->devices, + !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { if (device->devid == devid && (!uuid || memcmp(device->uuid, uuid, @@ -6477,11 +6468,8 @@ struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices, return device; } } - if (seed) - fs_devices = fs_devices->seed; - else - return NULL; } + return NULL; } @@ -6732,13 +6720,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); - fs_devices = fs_info->fs_devices->seed; - while (fs_devices) { + list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; - fs_devices = fs_devices->seed; - } fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { @@ -6772,8 +6757,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, goto out; } - fs_devices->seed = fs_info->fs_devices->seed; - fs_info->fs_devices->seed = fs_devices; + list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); out: return fs_devices; } @@ -7193,17 +7177,23 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) { - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; - while (fs_devices) { - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); + fs_devices->fs_info = fs_info; - fs_devices->fs_info = fs_info; - fs_devices = fs_devices->seed; + mutex_lock(&fs_devices->device_list_mutex); + list_for_each_entry(device, &fs_devices->devices, dev_list) + device->fs_info = fs_info; + mutex_unlock(&fs_devices->device_list_mutex); + + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + mutex_lock(&seed_devs->device_list_mutex); + list_for_each_entry(device, &seed_devs->devices, dev_list) + device->fs_info = fs_info; + mutex_unlock(&seed_devs->device_list_mutex); + + seed_devs->fs_info = fs_info; } } @@ -7581,8 +7571,11 @@ static int verify_one_dev_extent(struct btrfs_fs_info *fs_info, /* It's possible this device is a dummy for seed device */ if (dev->disk_total_bytes == 0) { - dev = btrfs_find_device(fs_info->fs_devices->seed, devid, NULL, - NULL, false); + struct btrfs_fs_devices *devs; + + devs = list_first_entry(&fs_info->fs_devices->seed_list, + struct btrfs_fs_devices, seed_list); + dev = btrfs_find_device(devs, devid, NULL, NULL, false); if (!dev) { btrfs_err(fs_info, "failed to find seed devid %llu", devid); diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 524aa5eb5667..48bdca01e237 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -246,7 +246,7 @@ struct btrfs_fs_devices { */ struct list_head alloc_list; - struct btrfs_fs_devices *seed; + struct list_head seed_list; bool seeding; int opened; From 427c8fddb1296dd013d8af5a4957d9d88fb8ab4e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 17:04:36 +0300 Subject: [PATCH 156/271] btrfs: document some invariants of seed code Without good understanding of how seed devices works it's hard to grok some of what the code in open_seed_devices or btrfs_prepare_sprout does. Add comments hopefully reducing some of the cognitive load. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2d7c845b8e67..7a39f3aebc53 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2357,10 +2357,20 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) if (!fs_devices->seeding) return -EINVAL; + /* + * Private copy of the seed devices, anchored at + * fs_info->fs_devices->seed_list + */ seed_devices = alloc_fs_devices(NULL, NULL); if (IS_ERR(seed_devices)) return PTR_ERR(seed_devices); + /* + * It's necessary to retain a copy of the original seed fs_devices in + * fs_uuids so that filesystems which have been seeded can successfully + * reference the seed device from open_seed_devices. This also supports + * multiple fs seed. + */ old_devices = clone_fs_devices(fs_devices); if (IS_ERR(old_devices)) { kfree(seed_devices); @@ -6720,6 +6730,7 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, lockdep_assert_held(&uuid_mutex); ASSERT(fsid); + /* This will match only for multi-device seed fs */ list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list) if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE)) return fs_devices; @@ -6739,6 +6750,10 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, return fs_devices; } + /* + * Upon first call for a seed fs fsid, just create a private copy of the + * respective fs_devices and anchor it at fs_info->fs_devices->seed_list + */ fs_devices = clone_fs_devices(fs_devices); if (IS_ERR(fs_devices)) return fs_devices; From 68abf360160ca085bb5109d35692ea0f9581f8d1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 16:26:46 +0300 Subject: [PATCH 157/271] btrfs: remove alloc_list splice in btrfs_prepare_sprout btrfs_prepare_sprout is called when the first rw device is added to a seed filesystem. This means the filesystem can't have its alloc_list be non-empty, since seed filesystems are read only. Simply remove the code altogether. Reviewed-by: Josef Bacik Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 7a39f3aebc53..9800e966ef6e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2391,10 +2391,6 @@ static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info) list_for_each_entry(device, &seed_devices->devices, dev_list) device->fs_devices = seed_devices; - mutex_lock(&fs_info->chunk_mutex); - list_splice_init(&fs_devices->alloc_list, &seed_devices->alloc_list); - mutex_unlock(&fs_info->chunk_mutex); - fs_devices->seeding = false; fs_devices->num_devices = 0; fs_devices->open_devices = 0; From 62cf5391209ac7b258a82316c4d703342863fd37 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:18:27 -0400 Subject: [PATCH 158/271] btrfs: move btrfs_rm_dev_replace_free_srcdev outside of all locks When closing and freeing the source device we could end up doing our final blkdev_put() on the bdev, which will grab the bd_mutex. As such we want to be holding as few locks as possible, so move this call outside of the dev_replace->lock_finishing_cancel_unmount lock. Since we're modifying the fs_devices we need to make sure we're holding the uuid_mutex here, so take that as well. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 1 - 1 file changed, 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9800e966ef6e..503ccc3a091e 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2216,7 +2216,6 @@ void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev) void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev) { - struct btrfs_fs_info *fs_info = srcdev->fs_info; struct btrfs_fs_devices *fs_devices = srcdev->fs_devices; mutex_lock(&uuid_mutex); From 425c6ed6486f1e2ed892eadb94fd4829c299493e Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Mon, 10 Aug 2020 11:42:28 -0400 Subject: [PATCH 159/271] btrfs: do not hold device_list_mutex when closing devices The following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 5.8.0-rc7-00169-g87212851a027-dirty #929 Not tainted ------------------------------------------------------ fsstress/8739 is trying to acquire lock: ffff88bfd0eb0c90 (&fs_info->reloc_mutex){+.+.}-{3:3}, at: btrfs_record_root_in_trans+0x43/0x70 but task is already holding lock: ffff88bfbd16e538 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x6a/0x4a0 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #10 (sb_pagefaults){.+.+}-{0:0}: __sb_start_write+0x129/0x210 btrfs_page_mkwrite+0x6a/0x4a0 do_page_mkwrite+0x4d/0xc0 handle_mm_fault+0x103c/0x1730 exc_page_fault+0x340/0x660 asm_exc_page_fault+0x1e/0x30 -> #9 (&mm->mmap_lock#2){++++}-{3:3}: __might_fault+0x68/0x90 _copy_to_user+0x1e/0x80 perf_read+0x141/0x2c0 vfs_read+0xad/0x1b0 ksys_read+0x5f/0xe0 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #8 (&cpuctx_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 perf_event_init_cpu+0x88/0x150 perf_event_init+0x1db/0x20b start_kernel+0x3ae/0x53c secondary_startup_64+0xa4/0xb0 -> #7 (pmus_lock){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 perf_event_init_cpu+0x4f/0x150 cpuhp_invoke_callback+0xb1/0x900 _cpu_up.constprop.26+0x9f/0x130 cpu_up+0x7b/0xc0 bringup_nonboot_cpus+0x4f/0x60 smp_init+0x26/0x71 kernel_init_freeable+0x110/0x258 kernel_init+0xa/0x103 ret_from_fork+0x1f/0x30 -> #6 (cpu_hotplug_lock){++++}-{0:0}: cpus_read_lock+0x39/0xb0 kmem_cache_create_usercopy+0x28/0x230 kmem_cache_create+0x12/0x20 bioset_init+0x15e/0x2b0 init_bio+0xa3/0xaa do_one_initcall+0x5a/0x2e0 kernel_init_freeable+0x1f4/0x258 kernel_init+0xa/0x103 ret_from_fork+0x1f/0x30 -> #5 (bio_slab_lock){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 bioset_init+0xbc/0x2b0 __blk_alloc_queue+0x6f/0x2d0 blk_mq_init_queue_data+0x1b/0x70 loop_add+0x110/0x290 [loop] fq_codel_tcf_block+0x12/0x20 [sch_fq_codel] do_one_initcall+0x5a/0x2e0 do_init_module+0x5a/0x220 load_module+0x2459/0x26e0 __do_sys_finit_module+0xba/0xe0 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #4 (loop_ctl_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 lo_open+0x18/0x50 [loop] __blkdev_get+0xec/0x570 blkdev_get+0xe8/0x150 do_dentry_open+0x167/0x410 path_openat+0x7c9/0xa80 do_filp_open+0x93/0x100 do_sys_openat2+0x22a/0x2e0 do_sys_open+0x4b/0x80 do_syscall_64+0x50/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #3 (&bdev->bd_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 blkdev_put+0x1d/0x120 close_fs_devices.part.31+0x84/0x130 btrfs_close_devices+0x44/0xb0 close_ctree+0x296/0x2b2 generic_shutdown_super+0x69/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (&fs_devs->device_list_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_run_dev_stats+0x49/0x480 commit_cowonly_roots+0xb5/0x2a0 btrfs_commit_transaction+0x516/0xa60 sync_filesystem+0x6b/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->tree_log_mutex){+.+.}-{3:3}: __mutex_lock+0x9f/0x930 btrfs_commit_transaction+0x4bb/0xa60 sync_filesystem+0x6b/0x90 generic_shutdown_super+0x22/0x100 kill_anon_super+0xe/0x30 btrfs_kill_super+0x12/0x20 deactivate_locked_super+0x29/0x60 cleanup_mnt+0xb8/0x140 task_work_run+0x6d/0xb0 __prepare_exit_to_usermode+0x1cc/0x1e0 do_syscall_64+0x5c/0x90 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&fs_info->reloc_mutex){+.+.}-{3:3}: __lock_acquire+0x1272/0x2310 lock_acquire+0x9e/0x360 __mutex_lock+0x9f/0x930 btrfs_record_root_in_trans+0x43/0x70 start_transaction+0xd1/0x5d0 btrfs_dirty_inode+0x42/0xd0 file_update_time+0xc8/0x110 btrfs_page_mkwrite+0x10c/0x4a0 do_page_mkwrite+0x4d/0xc0 handle_mm_fault+0x103c/0x1730 exc_page_fault+0x340/0x660 asm_exc_page_fault+0x1e/0x30 other info that might help us debug this: Chain exists of: &fs_info->reloc_mutex --> &mm->mmap_lock#2 --> sb_pagefaults Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(sb_pagefaults); lock(&mm->mmap_lock#2); lock(sb_pagefaults); lock(&fs_info->reloc_mutex); *** DEADLOCK *** 3 locks held by fsstress/8739: #0: ffff88bee66eeb68 (&mm->mmap_lock#2){++++}-{3:3}, at: exc_page_fault+0x173/0x660 #1: ffff88bfbd16e538 (sb_pagefaults){.+.+}-{0:0}, at: btrfs_page_mkwrite+0x6a/0x4a0 #2: ffff88bfbd16e630 (sb_internal){.+.+}-{0:0}, at: start_transaction+0x3da/0x5d0 stack backtrace: CPU: 17 PID: 8739 Comm: fsstress Kdump: loaded Not tainted 5.8.0-rc7-00169-g87212851a027-dirty #929 Hardware name: Quanta Tioga Pass Single Side 01-0030993006/Tioga Pass Single Side, BIOS F08_3A18 12/20/2018 Call Trace: dump_stack+0x78/0xa0 check_noncircular+0x165/0x180 __lock_acquire+0x1272/0x2310 ? btrfs_get_alloc_profile+0x150/0x210 lock_acquire+0x9e/0x360 ? btrfs_record_root_in_trans+0x43/0x70 __mutex_lock+0x9f/0x930 ? btrfs_record_root_in_trans+0x43/0x70 ? lock_acquire+0x9e/0x360 ? join_transaction+0x5d/0x450 ? find_held_lock+0x2d/0x90 ? btrfs_record_root_in_trans+0x43/0x70 ? join_transaction+0x3d5/0x450 ? btrfs_record_root_in_trans+0x43/0x70 btrfs_record_root_in_trans+0x43/0x70 start_transaction+0xd1/0x5d0 btrfs_dirty_inode+0x42/0xd0 file_update_time+0xc8/0x110 btrfs_page_mkwrite+0x10c/0x4a0 ? handle_mm_fault+0x5e/0x1730 do_page_mkwrite+0x4d/0xc0 ? __do_fault+0x32/0x150 handle_mm_fault+0x103c/0x1730 exc_page_fault+0x340/0x660 ? asm_exc_page_fault+0x8/0x30 asm_exc_page_fault+0x1e/0x30 RIP: 0033:0x7faa6c9969c4 Was seen in testing. The fix is similar to that of btrfs: open device without device_list_mutex where we're holding the device_list_mutex and then grab the bd_mutex, which pulls in a bunch of dependencies under the bd_mutex. We only ever call btrfs_close_devices() on mount failure or unmount, so we're save to not have the device_list_mutex here. We're already holding the uuid_mutex which keeps us safe from any external modification of the fs_devices. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 503ccc3a091e..29c6f0154c62 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1160,14 +1160,13 @@ static void close_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device, *tmp; + lockdep_assert_held(&uuid_mutex); + if (--fs_devices->opened > 0) return; - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) { + list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list) btrfs_close_one_device(device); - } - mutex_unlock(&fs_devices->device_list_mutex); WARN_ON(fs_devices->open_devices); WARN_ON(fs_devices->rw_devices); @@ -1185,13 +1184,13 @@ void btrfs_close_devices(struct btrfs_fs_devices *fs_devices) close_fs_devices(fs_devices); if (!fs_devices->opened) list_splice_init(&fs_devices->seed_list, &list); - mutex_unlock(&uuid_mutex); list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) { close_fs_devices(fs_devices); list_del(&fs_devices->seed_list); free_fs_devices(fs_devices); } + mutex_unlock(&uuid_mutex); } static int open_fs_devices(struct btrfs_fs_devices *fs_devices, From b4c5d8fdfff3e2b6c4fa4a5043e8946dff500f8c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 24 Jul 2020 14:46:09 +0800 Subject: [PATCH 160/271] btrfs: qgroup: fix wrong qgroup metadata reserve for delayed inode For delayed inode facility, qgroup metadata is reserved for it, and later freed. However we're freeing more bytes than we reserved. In btrfs_delayed_inode_reserve_metadata(): num_bytes = btrfs_calc_metadata_size(fs_info, 1); ... ret = btrfs_qgroup_reserve_meta_prealloc(root, fs_info->nodesize, true); ... if (!ret) { node->bytes_reserved = num_bytes; But in btrfs_delayed_inode_release_metadata(): if (qgroup_free) btrfs_qgroup_free_meta_prealloc(node->root, node->bytes_reserved); else btrfs_qgroup_convert_reserved_meta(node->root, node->bytes_reserved); This means, we're always releasing more qgroup metadata rsv than we have reserved. This won't trigger selftest warning, as btrfs qgroup metadata rsv has extra protection against cases like quota enabled half-way. But we still need to fix this problem any way. This patch will use the same num_bytes for qgroup metadata rsv so we could handle it correctly. Fixes: f218ea6c4792 ("btrfs: delayed-inode: Remove wrong qgroup meta reservation calls") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/delayed-inode.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf1595a42a98..0727b10a9a89 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -627,8 +627,7 @@ static int btrfs_delayed_inode_reserve_metadata( */ if (!src_rsv || (!trans->bytes_reserved && src_rsv->type != BTRFS_BLOCK_RSV_DELALLOC)) { - ret = btrfs_qgroup_reserve_meta_prealloc(root, - fs_info->nodesize, true); + ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true); if (ret < 0) return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, From e85fde5162bf1b242cbd6daf7dba0f9b457d592b Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Fri, 24 Jul 2020 14:46:10 +0800 Subject: [PATCH 161/271] btrfs: qgroup: fix qgroup meta rsv leak for subvolume operations [BUG] When quota is enabled for TEST_DEV, generic/013 sometimes fails like this: generic/013 14s ... _check_dmesg: something found in dmesg (see xfstests-dev/results//generic/013.dmesg) And with the following metadata leak: BTRFS warning (device dm-3): qgroup 0/1370 has unreleased space, type 2 rsv 49152 ------------[ cut here ]------------ WARNING: CPU: 2 PID: 47912 at fs/btrfs/disk-io.c:4078 close_ctree+0x1dc/0x323 [btrfs] Call Trace: btrfs_put_super+0x15/0x17 [btrfs] generic_shutdown_super+0x72/0x110 kill_anon_super+0x18/0x30 btrfs_kill_super+0x17/0x30 [btrfs] deactivate_locked_super+0x3b/0xa0 deactivate_super+0x40/0x50 cleanup_mnt+0x135/0x190 __cleanup_mnt+0x12/0x20 task_work_run+0x64/0xb0 __prepare_exit_to_usermode+0x1bc/0x1c0 __syscall_return_slowpath+0x47/0x230 do_syscall_64+0x64/0xb0 entry_SYSCALL_64_after_hwframe+0x44/0xa9 ---[ end trace a6cfd45ba80e4e06 ]--- BTRFS error (device dm-3): qgroup reserved space leaked BTRFS info (device dm-3): disk space caching is enabled BTRFS info (device dm-3): has skinny extents [CAUSE] The qgroup preallocated meta rsv operations of that offending root are: btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072 btrfs_delayed_inode_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=131072 btrfs_subvolume_reserve_metadata: rsv_meta_prealloc root=1370 num_bytes=49152 btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072 btrfs_delayed_inode_release_metadata: convert_meta_prealloc root=1370 num_bytes=-131072 It's pretty obvious that, we reserve qgroup meta rsv in btrfs_subvolume_reserve_metadata(), but doesn't have corresponding release/convert calls in btrfs_subvolume_release_metadata(). This leads to the leakage. [FIX] To fix this bug, we should follow what we're doing in btrfs_delalloc_reserve_metadata(), where we reserve qgroup space, and add it to block_rsv->qgroup_rsv_reserved. And free the qgroup reserved metadata space when releasing the block_rsv. To do this, we need to change the btrfs_subvolume_release_metadata() to accept btrfs_root, and record the qgroup_to_release number, and call btrfs_qgroup_convert_reserved_meta() for it. Fixes: 733e03a0b26a ("btrfs: qgroup: Split meta rsv type into meta_prealloc and meta_pertrans") CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/inode.c | 2 +- fs/btrfs/ioctl.c | 6 +++--- fs/btrfs/root-tree.c | 13 +++++++++++-- 4 files changed, 16 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index eb7adc069926..f9d4e0958e2e 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2622,7 +2622,7 @@ enum btrfs_flush_state { int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv, int nitems, bool use_global_rsv); -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a50a40f8bef2..123521aa5595 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4051,7 +4051,7 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) err = ret; inode->i_flags |= S_DEAD; out_release: - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); out_up_write: up_write(&fs_info->subvol_sem); if (err) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index a5355a16eabb..3779a6c12184 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -618,7 +618,7 @@ static noinline int create_subvol(struct inode *dir, trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); goto fail_free; } trans->block_rsv = &block_rsv; @@ -742,7 +742,7 @@ static noinline int create_subvol(struct inode *dir, kfree(root_item); trans->block_rsv = NULL; trans->bytes_reserved = 0; - btrfs_subvolume_release_metadata(fs_info, &block_rsv); + btrfs_subvolume_release_metadata(root, &block_rsv); err = btrfs_commit_transaction(trans); if (err && !ret) @@ -856,7 +856,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir, if (ret && pending_snapshot->snap) pending_snapshot->snap->anon_dev = 0; btrfs_put_root(pending_snapshot->snap); - btrfs_subvolume_release_metadata(fs_info, &pending_snapshot->block_rsv); + btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv); free_pending: if (pending_snapshot->anon_dev) free_anon_bdev(pending_snapshot->anon_dev); diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c index c89697486366..702dc5441f03 100644 --- a/fs/btrfs/root-tree.c +++ b/fs/btrfs/root-tree.c @@ -512,11 +512,20 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root, if (ret && qgroup_num_bytes) btrfs_qgroup_free_meta_prealloc(root, qgroup_num_bytes); + if (!ret) { + spin_lock(&rsv->lock); + rsv->qgroup_rsv_reserved += qgroup_num_bytes; + spin_unlock(&rsv->lock); + } return ret; } -void btrfs_subvolume_release_metadata(struct btrfs_fs_info *fs_info, +void btrfs_subvolume_release_metadata(struct btrfs_root *root, struct btrfs_block_rsv *rsv) { - btrfs_block_rsv_release(fs_info, rsv, (u64)-1, NULL); + struct btrfs_fs_info *fs_info = root->fs_info; + u64 qgroup_to_release; + + btrfs_block_rsv_release(fs_info, rsv, (u64)-1, &qgroup_to_release); + btrfs_qgroup_convert_reserved_meta(root, qgroup_to_release); } From 217f5004fee678bf238fbbc7e7a01c55df92e732 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 12 Aug 2020 16:16:35 +0300 Subject: [PATCH 162/271] btrfs: rework error detection in init_tree_roots To avoid duplicating 3 lines of code the error detection logic in init_tree_roots is somewhat quirky. It first checks for the presence of any error condition, then checks for the specific condition to perform any specific actions. That's spurious because directly checking for each respective error condition and doing the necessary steps is more obvious. While at it change the -EUCLEAN to -EIO in case the extent buffer is not read correctly, this is in line with other sites which return -EIO when the eb couldn't be read. Additionally it results in smaller code and the code reads more linearly: add/remove: 0/0 grow/shrink: 0/1 up/down: 0/-95 (-95) Function old new delta open_ctree 17243 17148 -95 Total: Before=113104, After=113009, chg -0.08% Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 84799ef76758..4eb56f00ee81 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2623,18 +2623,17 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) level = btrfs_super_root_level(sb); tree_root->node = read_tree_block(fs_info, btrfs_super_root(sb), generation, level, NULL); - if (IS_ERR(tree_root->node) || - !extent_buffer_uptodate(tree_root->node)) { + if (IS_ERR(tree_root->node)) { handle_error = true; + ret = PTR_ERR(tree_root->node); + tree_root->node = NULL; + btrfs_warn(fs_info, "couldn't read tree root"); + continue; - if (IS_ERR(tree_root->node)) { - ret = PTR_ERR(tree_root->node); - tree_root->node = NULL; - } else if (!extent_buffer_uptodate(tree_root->node)) { - ret = -EUCLEAN; - } - - btrfs_warn(fs_info, "failed to read tree root"); + } else if (!extent_buffer_uptodate(tree_root->node)) { + handle_error = true; + ret = -EIO; + btrfs_warn(fs_info, "error while reading tree root"); continue; } From f98b6215d7d1cda6ac552b7bc58a6ad092aa517c Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:47 +0800 Subject: [PATCH 163/271] btrfs: extent_io: do extra check for extent buffer read write functions Although we have start, len check for extent buffer reader/write (e.g. read_extent_buffer()), these checks have limitations: - No overflow check Values like start = 1024 len = -1024 can still pass the basic (start + len) > eb->len check. - Checks are not consistent For read_extent_buffer() we only check (start + len) against eb->len. While for memcmp_extent_buffer() we also check start against eb->len. - Different error reporting mechanism We use WARN() in read_extent_buffer() but BUG() in memcpy_extent_buffer(). - Still modify memory if the request is obviously wrong In read_extent_buffer() even we find (start + len) > eb->len, we still call memset(dst, 0, len), which can easily cause memory access error if start + len overflows. To address above problems, this patch creates a new common function to check such access, check_eb_range(). - Add overflow check This function checks start, start + len against eb->len and overflow check. - Unified checks - Unified error reports Will call WARN() if CONFIG_BTRFS_DEBUG is configured. And also do btrfs_warn() message for non-debug build. - Exit ASAP if check fails No more possible memory corruption. - Add extra comment for @start @len used in those functions as it's sometimes confused with the logical addressing instead of a range inside the eb space Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202817 [ Inspired by above report, the report itself is already addressed ] Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba [ use check_add_overflow ] Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 84 +++++++++++++++++++++++++------------------- 1 file changed, 47 insertions(+), 37 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0320ddb0133e..a28d442d65b5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5622,6 +5622,36 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) return ret; } +static bool report_eb_range(const struct extent_buffer *eb, unsigned long start, + unsigned long len) +{ + btrfs_warn(eb->fs_info, + "access to eb bytenr %llu len %lu out of range start %lu len %lu", + eb->start, eb->len, start, len); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + + return true; +} + +/* + * Check if the [start, start + len) range is valid before reading/writing + * the eb. + * NOTE: @start and @len are offset inside the eb, not logical address. + * + * Caller should not touch the dst/src memory if this function returns error. + */ +static inline int check_eb_range(const struct extent_buffer *eb, + unsigned long start, unsigned long len) +{ + unsigned long offset; + + /* start, start + len should not go beyond eb->len nor overflow */ + if (unlikely(check_add_overflow(start, len, &offset) || offset > eb->len)) + return report_eb_range(eb, start, len); + + return false; +} + void read_extent_buffer(const struct extent_buffer *eb, void *dstv, unsigned long start, unsigned long len) { @@ -5632,12 +5662,8 @@ void read_extent_buffer(const struct extent_buffer *eb, void *dstv, char *dst = (char *)dstv; unsigned long i = start >> PAGE_SHIFT; - if (start + len > eb->len) { - WARN(1, KERN_ERR "btrfs bad mapping eb start %llu len %lu, wanted %lu %lu\n", - eb->start, eb->len, start, len); - memset(dst, 0, len); + if (check_eb_range(eb, start, len)) return; - } offset = offset_in_page(start); @@ -5702,8 +5728,8 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, unsigned long i = start >> PAGE_SHIFT; int ret = 0; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return -EINVAL; offset = offset_in_page(start); @@ -5756,8 +5782,8 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, char *src = (char *)srcv; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5785,8 +5811,8 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, char *kaddr; unsigned long i = start >> PAGE_SHIFT; - WARN_ON(start > eb->len); - WARN_ON(start + len > eb->start + eb->len); + if (check_eb_range(eb, start, len)) + return; offset = offset_in_page(start); @@ -5830,6 +5856,10 @@ void copy_extent_buffer(const struct extent_buffer *dst, char *kaddr; unsigned long i = dst_offset >> PAGE_SHIFT; + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(src, src_offset, len)) + return; + WARN_ON(src->len != dst_len); offset = offset_in_page(dst_offset); @@ -6019,25 +6049,15 @@ void memcpy_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu dst len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu dst len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; while (len > 0) { dst_off_in_page = offset_in_page(dst_offset); @@ -6064,7 +6084,6 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_offset, unsigned long src_offset, unsigned long len) { - struct btrfs_fs_info *fs_info = dst->fs_info; size_t cur; size_t dst_off_in_page; size_t src_off_in_page; @@ -6073,18 +6092,9 @@ void memmove_extent_buffer(const struct extent_buffer *dst, unsigned long dst_i; unsigned long src_i; - if (src_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus src_offset %lu move len %lu len %lu", - src_offset, len, dst->len); - BUG(); - } - if (dst_offset + len > dst->len) { - btrfs_err(fs_info, - "memmove bogus dst_offset %lu move len %lu len %lu", - dst_offset, len, dst->len); - BUG(); - } + if (check_eb_range(dst, dst_offset, len) || + check_eb_range(dst, src_offset, len)) + return; if (dst_offset < src_offset) { memcpy_extent_buffer(dst, dst_offset, src_offset, len); return; From 1c2a07f598d526e39acbf1837b8d521fc0dab9c5 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:48 +0800 Subject: [PATCH 164/271] btrfs: extent-tree: kill BUG_ON() in __btrfs_free_extent() __btrfs_free_extent() is doing two things: 1. Reduce the refs number of an extent backref Either it's an inline extent backref (inside EXTENT/METADATA item) or a keyed extent backref (SHARED_* item). We only need to locate that backref line, either reduce the number or remove the backref line completely. 2. Update the refs count in EXTENT/METADATA_ITEM During step 1), we will try to locate the EXTENT/METADATA_ITEM without triggering another btrfs_search_slot() as fast path. Only when we fail to locate that item, we will trigger another btrfs_search_slot() to get that EXTENT/METADATA_ITEM after we updated/deleted the backref line. And we have a lot of strict checks on things like refs_to_drop against extent refs and special case checks for single ref extents. There are 7 BUG_ON()s, although they're doing correct checks, they can be triggered by crafted images. This patch improves the function: - Introduce two examples to show what __btrfs_free_extent() is doing One inline backref case and one keyed case. Should cover most cases. - Kill all BUG_ON()s with proper error message and optional leaf dump - Add comment to show the overall flow Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202819 [ The report triggers one BUG_ON() in __btrfs_free_extent() ] Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 160 +++++++++++++++++++++++++++++++++++++---- 1 file changed, 147 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 79c4432e6a3f..40e5b41e91cc 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -2934,6 +2934,65 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) return 0; } +/* + * Drop one or more refs of @node. + * + * 1. Locate the extent refs. + * It's either inline in EXTENT/METADATA_ITEM or in keyed SHARED_* item. + * Locate it, then reduce the refs number or remove the ref line completely. + * + * 2. Update the refs count in EXTENT/METADATA_ITEM + * + * Inline backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 2 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * extent data backref root FS_TREE objectid 257 offset 0 count 1 + * + * This function gets called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 257 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 16201 itemsize 82 + * refs 1 gen 6 flags DATA + * extent data backref root FS_TREE objectid 258 offset 0 count 1 + * + * Keyed backref case: + * + * in extent tree we have: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 754 gen 6 flags DATA + * [...] + * item 2 key (13631488 EXTENT_DATA_REF ) itemoff 3915 itemsize 28 + * extent data backref root FS_TREE objectid 866 offset 0 count 1 + * + * This function get called with: + * + * node->bytenr = 13631488 + * node->num_bytes = 1048576 + * root_objectid = FS_TREE + * owner_objectid = 866 + * owner_offset = 0 + * refs_to_drop = 1 + * + * Then we should get some like: + * + * item 0 key (13631488 EXTENT_ITEM 1048576) itemoff 3971 itemsize 24 + * refs 753 gen 6 flags DATA + * + * And that (13631488 EXTENT_DATA_REF ) gets removed. + */ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, u64 parent, u64 root_objectid, u64 owner_objectid, @@ -2966,7 +3025,15 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, path->leave_spinning = 1; is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID; - BUG_ON(!is_data && refs_to_drop != 1); + + if (!is_data && refs_to_drop != 1) { + btrfs_crit(info, +"invalid refs_to_drop, dropping more than 1 refs for tree block %llu refs_to_drop %u", + node->bytenr, refs_to_drop); + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } if (is_data) skinny_metadata = false; @@ -2975,6 +3042,13 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, parent, root_objectid, owner_objectid, owner_offset); if (ret == 0) { + /* + * Either the inline backref or the SHARED_DATA_REF/ + * SHARED_BLOCK_REF is found + * + * Here is a quick path to locate EXTENT/METADATA_ITEM. + * It's possible the EXTENT/METADATA_ITEM is near current slot. + */ extent_slot = path->slots[0]; while (extent_slot >= 0) { btrfs_item_key_to_cpu(path->nodes[0], &key, @@ -2991,13 +3065,21 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, found_extent = 1; break; } + + /* Quick path didn't find the EXTEMT/METADATA_ITEM */ if (path->slots[0] - extent_slot > 5) break; extent_slot--; } if (!found_extent) { - BUG_ON(iref); + if (iref) { + btrfs_crit(info, +"invalid iref, no EXTENT/METADATA_ITEM found but has inline extent ref"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } + /* Must be SHARED_* item, remove the backref first */ ret = remove_extent_backref(trans, path, NULL, refs_to_drop, is_data, &last_ref); @@ -3008,6 +3090,7 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, btrfs_release_path(path); path->leave_spinning = 1; + /* Slow path to locate EXTENT/METADATA_ITEM */ key.objectid = bytenr; key.type = BTRFS_EXTENT_ITEM_KEY; key.offset = num_bytes; @@ -3082,19 +3165,26 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID && key.type == BTRFS_EXTENT_ITEM_KEY) { struct btrfs_tree_block_info *bi; - BUG_ON(item_size < sizeof(*ei) + sizeof(*bi)); + if (item_size < sizeof(*ei) + sizeof(*bi)) { + btrfs_crit(info, +"invalid extent item size for key (%llu, %u, %llu) owner %llu, has %u expect >= %lu", + key.objectid, key.type, key.offset, + owner_objectid, item_size, + sizeof(*ei) + sizeof(*bi)); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } bi = (struct btrfs_tree_block_info *)(ei + 1); WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi)); } refs = btrfs_extent_refs(leaf, ei); if (refs < refs_to_drop) { - btrfs_err(info, - "trying to drop %d refs but we only have %Lu for bytenr %Lu", + btrfs_crit(info, + "trying to drop %d refs but we only have %llu for bytenr %llu", refs_to_drop, refs, bytenr); - ret = -EINVAL; - btrfs_abort_transaction(trans, ret); - goto out; + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; } refs -= refs_to_drop; @@ -3106,7 +3196,12 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, * be updated by remove_extent_backref */ if (iref) { - BUG_ON(!found_extent); + if (!found_extent) { + btrfs_crit(info, +"invalid iref, got inlined extent ref but no EXTENT/METADATA_ITEM found"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { btrfs_set_extent_refs(leaf, ei, refs); btrfs_mark_buffer_dirty(leaf); @@ -3121,13 +3216,39 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, } } } else { + /* In this branch refs == 1 */ if (found_extent) { - BUG_ON(is_data && refs_to_drop != - extent_data_ref_count(path, iref)); + if (is_data && refs_to_drop != + extent_data_ref_count(path, iref)) { + btrfs_crit(info, + "invalid refs_to_drop, current refs %u refs_to_drop %u", + extent_data_ref_count(path, iref), + refs_to_drop); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } if (iref) { - BUG_ON(path->slots[0] != extent_slot); + if (path->slots[0] != extent_slot) { + btrfs_crit(info, +"invalid iref, extent item key (%llu %u %llu) doesn't have wanted iref", + key.objectid, key.type, + key.offset); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } } else { - BUG_ON(path->slots[0] != extent_slot + 1); + /* + * No inline ref, we must be at SHARED_* item, + * And it's single ref, it must be: + * | extent_slot ||extent_slot + 1| + * [ EXTENT/METADATA_ITEM ][ SHARED_* ITEM ] + */ + if (path->slots[0] != extent_slot + 1) { + btrfs_crit(info, + "invalid SHARED_* item, previous item is not EXTENT/METADATA_ITEM"); + btrfs_abort_transaction(trans, -EUCLEAN); + goto err_dump; + } path->slots[0] = extent_slot; num_to_del = 2; } @@ -3168,6 +3289,19 @@ static int __btrfs_free_extent(struct btrfs_trans_handle *trans, out: btrfs_free_path(path); return ret; +err_dump: + /* + * Leaf dump can take up a lot of log buffer, so we only do full leaf + * dump for debug build. + */ + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + btrfs_crit(info, "path->slots[0]=%d extent_slot=%d", + path->slots[0], extent_slot); + btrfs_print_leaf(path->nodes[0]); + } + + btrfs_free_path(path); + return -EUCLEAN; } /* From 07cce5cf3b489419aa8e87f48a55f4e190a30876 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:49 +0800 Subject: [PATCH 165/271] btrfs: extent-tree: kill the BUG_ON() in insert_inline_extent_backref() [BUG] With a crafted image, btrfs can panic at insert_inline_extent_backref(): kernel BUG at fs/btrfs/extent-tree.c:1857! invalid opcode: 0000 [#1] SMP PTI CPU: 0 PID: 1117 Comm: btrfs-transacti Not tainted 5.0.0-rc8+ #9 RIP: 0010:insert_inline_extent_backref+0xcc/0xe0 RSP: 0018:ffffac4dc1287be8 EFLAGS: 00010293 RAX: 0000000000000000 RBX: 0000000000000007 RCX: 0000000000000001 RDX: 0000000000001000 RSI: 0000000000000000 RDI: 0000000000000000 RBP: ffffac4dc1287c28 R08: ffffac4dc1287ab8 R09: ffffac4dc1287ac0 R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 R13: ffff8febef88a540 R14: ffff8febeaa7bc30 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff8febf7a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f663ace94c0 CR3: 0000000235698006 CR4: 00000000000206f0 Call Trace: ? _cond_resched+0x1a/0x50 __btrfs_inc_extent_ref.isra.64+0x7e/0x240 ? btrfs_merge_delayed_refs+0xa5/0x330 __btrfs_run_delayed_refs+0x653/0x1120 btrfs_run_delayed_refs+0xdb/0x1b0 btrfs_commit_transaction+0x52/0x950 ? start_transaction+0x94/0x450 transaction_kthread+0x163/0x190 kthread+0x105/0x140 ? btrfs_cleanup_transaction+0x560/0x560 ? kthread_destroy_worker+0x50/0x50 ret_from_fork+0x35/0x40 Modules linked in: ---[ end trace 2ad8b3de903cf825 ]--- [CAUSE] Due to extent tree corruption (still valid by itself, but bad cross ref), we can allocate an extent which is still in extent tree. The offending tree block of that case is from csum tree. The newly allocated tree block is also for csum tree. Then we will try to insert a tree block ref for the existing tree block ref. For a tree extent item, tree block can never be shared directly by the same tree twice. We have such BUG_ON() to prevent such problem, but this is not a proper error handling. [FIX] Replace that BUG_ON() with proper error message and leaf dump for debug build. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202829 Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Signed-off-by: David Sterba --- fs/btrfs/extent-tree.c | 20 +++++++++++++++++++- 1 file changed, 19 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 40e5b41e91cc..7b61b7b5122f 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1177,7 +1177,22 @@ int insert_inline_extent_backref(struct btrfs_trans_handle *trans, num_bytes, parent, root_objectid, owner, offset, 1); if (ret == 0) { - BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID); + /* + * We're adding refs to a tree block we already own, this + * should not happen at all. + */ + if (owner < BTRFS_FIRST_FREE_OBJECTID) { + btrfs_crit(trans->fs_info, +"adding refs to an existing tree ref, bytenr %llu num_bytes %llu root_objectid %llu", + bytenr, num_bytes, root_objectid); + if (IS_ENABLED(CONFIG_BTRFS_DEBUG)) { + WARN_ON(1); + btrfs_crit(trans->fs_info, + "path->slots[0]=%d path->nodes[0]:", path->slots[0]); + btrfs_print_leaf(path->nodes[0]); + } + return -EUCLEAN; + } update_inline_extent_backref(path, iref, refs_to_add, extent_op, NULL); } else if (ret == -ENOENT) { @@ -1397,6 +1412,9 @@ int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, /* * __btrfs_inc_extent_ref - insert backreference for a given extent * + * The counterpart is in __btrfs_free_extent(), with examples and more details + * how it works. + * * @trans: Handle of transaction * * @node: The delayed ref node used to get the bytenr/length for From d16c702fe4f274bd77b47d3ab737eadcf24e0b93 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Wed, 19 Aug 2020 14:35:50 +0800 Subject: [PATCH 166/271] btrfs: ctree: check key order before merging tree blocks [BUG] With a crafted image, btrfs can panic at btrfs_del_csums(): kernel BUG at fs/btrfs/ctree.c:3188! invalid opcode: 0000 [#1] SMP PTI CPU: 0 PID: 1156 Comm: btrfs-transacti Not tainted 5.0.0-rc8+ #9 RIP: 0010:btrfs_set_item_key_safe+0x16c/0x180 RSP: 0018:ffff976141257ab8 EFLAGS: 00010202 RAX: 0000000000000001 RBX: ffff898a6b890930 RCX: 0000000004b70000 RDX: 0000000000000000 RSI: ffff976141257bae RDI: ffff976141257acf RBP: ffff976141257b10 R08: 0000000000001000 R09: ffff9761412579a8 R10: 0000000000000000 R11: 0000000000000000 R12: ffff976141257abe R13: 0000000000000003 R14: ffff898a6a8be578 R15: ffff976141257bae FS: 0000000000000000(0000) GS:ffff898a77a00000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 CR2: 00007f779d9cd624 CR3: 000000022b2b4006 CR4: 00000000000206f0 Call Trace: truncate_one_csum+0xac/0xf0 btrfs_del_csums+0x24f/0x3a0 __btrfs_free_extent.isra.72+0x5a7/0xbe0 __btrfs_run_delayed_refs+0x539/0x1120 btrfs_run_delayed_refs+0xdb/0x1b0 btrfs_commit_transaction+0x52/0x950 ? start_transaction+0x94/0x450 transaction_kthread+0x163/0x190 kthread+0x105/0x140 ? btrfs_cleanup_transaction+0x560/0x560 ? kthread_destroy_worker+0x50/0x50 ret_from_fork+0x35/0x40 Modules linked in: ---[ end trace 93bf9db00e6c374e ]--- [CAUSE] This crafted image has a tricky key order corruption: checksum tree key (CSUM_TREE ROOT_ITEM 0) node 29741056 level 1 items 14 free 107 generation 19 owner CSUM_TREE ... key (EXTENT_CSUM EXTENT_CSUM 73785344) block 29757440 gen 19 key (EXTENT_CSUM EXTENT_CSUM 77594624) block 29753344 gen 19 ... leaf 29757440 items 5 free space 150 generation 19 owner CSUM_TREE item 0 key (EXTENT_CSUM EXTENT_CSUM 73785344) itemoff 2323 itemsize 1672 range start 73785344 end 75497472 length 1712128 item 1 key (EXTENT_CSUM EXTENT_CSUM 75497472) itemoff 2319 itemsize 4 range start 75497472 end 75501568 length 4096 item 2 key (EXTENT_CSUM EXTENT_CSUM 75501568) itemoff 579 itemsize 1740 range start 75501568 end 77283328 length 1781760 item 3 key (EXTENT_CSUM EXTENT_CSUM 77283328) itemoff 575 itemsize 4 range start 77283328 end 77287424 length 4096 item 4 key (EXTENT_CSUM EXTENT_CSUM 4120596480) itemoff 275 itemsize 300 <<< range start 4120596480 end 4120903680 length 307200 leaf 29753344 items 3 free space 1936 generation 19 owner CSUM_TREE item 0 key (18446744073457893366 EXTENT_CSUM 77594624) itemoff 2323 itemsize 1672 range start 77594624 end 79306752 length 1712128 ... Note the item 4 key of leaf 29757440, which is obviously too large, and even larger than the first key of the next leaf. However it still follows the key order in that tree block, thus tree checker is unable to detect it at read time, since tree checker can only work inside one leaf, thus such complex corruption can't be detected in advance. [FIX] The next time to detect such problem is at tree block merge time, which is in push_node_left(), balance_node_right(), push_leaf_left() or push_leaf_right(). Now we check if the key order of the right-most key of the left node is larger than the left-most key of the right node. By this we don't need to call the full tree-checker, while still keeping the key order correct as key order in each node is already checked by tree checker thus we only need to check the above two slots. Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=202833 Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 74 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index e0fd8a34efcc..be7d01055118 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3163,6 +3163,58 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, fixup_low_keys(path, &disk_key, 1); } +/* + * Check key order of two sibling extent buffers. + * + * Return true if something is wrong. + * Return false if everything is fine. + * + * Tree-checker only works inside one tree block, thus the following + * corruption can not be detected by tree-checker: + * + * Leaf @left | Leaf @right + * -------------------------------------------------------------- + * | 1 | 2 | 3 | 4 | 5 | f6 | | 7 | 8 | + * + * Key f6 in leaf @left itself is valid, but not valid when the next + * key in leaf @right is 7. + * This can only be checked at tree block merge time. + * And since tree checker has ensured all key order in each tree block + * is correct, we only need to bother the last key of @left and the first + * key of @right. + */ +static bool check_sibling_keys(struct extent_buffer *left, + struct extent_buffer *right) +{ + struct btrfs_key left_last; + struct btrfs_key right_first; + int level = btrfs_header_level(left); + int nr_left = btrfs_header_nritems(left); + int nr_right = btrfs_header_nritems(right); + + /* No key to check in one of the tree blocks */ + if (!nr_left || !nr_right) + return false; + + if (level) { + btrfs_node_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_node_key_to_cpu(right, &right_first, 0); + } else { + btrfs_item_key_to_cpu(left, &left_last, nr_left - 1); + btrfs_item_key_to_cpu(right, &right_first, 0); + } + + if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) { + btrfs_crit(left->fs_info, +"bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)", + left_last.objectid, left_last.type, + left_last.offset, right_first.objectid, + right_first.type, right_first.offset); + return true; + } + return false; +} + /* * try to push data from one node into the next node left in the * tree. @@ -3207,6 +3259,12 @@ static int push_node_left(struct btrfs_trans_handle *trans, } else push_items = min(src_nritems - 8, push_items); + /* dst is the left eb, src is the middle eb */ + if (check_sibling_keys(dst, src)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); @@ -3275,6 +3333,12 @@ static int balance_node_right(struct btrfs_trans_handle *trans, if (max_push < push_items) push_items = max_push; + /* dst is the right eb, src is the middle eb */ + if (check_sibling_keys(src, dst)) { + ret = -EUCLEAN; + btrfs_abort_transaction(trans, ret); + return ret; + } ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), @@ -3751,6 +3815,12 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (left_nritems == 0) goto out_unlock; + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + btrfs_tree_unlock(right); + free_extent_buffer(right); + return ret; + } if (path->slots[0] == left_nritems && !empty) { /* Key greater than all keys in the leaf, right neighbor has * enough room for it and we're not emptying our leaf to delete @@ -3988,6 +4058,10 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root goto out; } + if (check_sibling_keys(left, right)) { + ret = -EUCLEAN; + goto out; + } return __push_leaf_left(path, min_data_size, empty, left, free_space, right_nritems, max_slot); From f4cfa9bdd40c038ac901fbf3c57ab63e9e8eb949 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:22 +0300 Subject: [PATCH 167/271] btrfs: use RCU for quick device check in btrfs_init_new_device When adding a new device there's a mandatory check to see if a device is being duplicated to the filesystem it's added to. Since this is a read-only operations not necessary to take device_list_mutex and can simply make do with an rcu-readlock. Using just RCU is safe because there won't be another device add delete running in parallel as btrfs_init_new_device is called only from btrfs_ioctl_add_dev. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 29c6f0154c62..5449e3899e67 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2515,16 +2515,15 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path filemap_write_and_wait(bdev->bd_inode->i_mapping); - mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { + rcu_read_lock(); + list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { if (device->bdev == bdev) { ret = -EEXIST; - mutex_unlock( - &fs_devices->device_list_mutex); + rcu_read_unlock(); goto error; } } - mutex_unlock(&fs_devices->device_list_mutex); + rcu_read_unlock(); device = btrfs_alloc_device(fs_info, NULL, NULL); if (IS_ERR(device)) { From 44cab9ba374a0341e12c024f43a83e12ec4978fd Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:23 +0300 Subject: [PATCH 168/271] btrfs: refactor locked condition in btrfs_init_new_device Invert unlocked to locked and exploit the fact it can only ever be modified if we are adding a new device to a seed filesystem. This allows to simplify the check in error: label. No semantics changes. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 5449e3899e67..3e3aacaedf47 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2497,7 +2497,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path u64 orig_super_num_devices; int seeding_dev = 0; int ret = 0; - bool unlocked = false; + bool locked = false; if (sb_rdonly(sb) && !fs_devices->seeding) return -EROFS; @@ -2511,6 +2511,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path seeding_dev = 1; down_write(&sb->s_umount); mutex_lock(&uuid_mutex); + locked = true; } filemap_write_and_wait(bdev->bd_inode->i_mapping); @@ -2645,7 +2646,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path if (seeding_dev) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); - unlocked = true; + locked = false; if (ret) /* transaction commit */ return ret; @@ -2706,7 +2707,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_free_device(device); error: blkdev_put(bdev, FMODE_EXCL); - if (seeding_dev && !unlocked) { + if (locked) { mutex_unlock(&uuid_mutex); up_write(&sb->s_umount); } From 4ae312e9728fe568ce0d863421c399ec829d6df9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:24 +0300 Subject: [PATCH 169/271] btrfs: remove redundant code from btrfs_free_stale_devices Following the refactor of btrfs_free_stale_devices in 7bcb8164ad94 ("btrfs: use device_list_mutex when removing stale devices") fs_devices are freed after they have been iterated by the inner list_for_each so the use-after-free fixed by introducing the break in fd649f10c3d2 ("btrfs: Fix use-after-free when cleaning up fs_devs with a single stale device") is no longer necessary. Just remove it altogether. No functional changes. Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 3e3aacaedf47..9355f5c78c15 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -595,8 +595,6 @@ static int btrfs_free_stale_devices(const char *path, btrfs_free_device(device); ret = 0; - if (fs_devices->num_devices == 0) - break; } mutex_unlock(&fs_devices->device_list_mutex); From b9ba017fb0771f67903d70f57908899d7b0020b1 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Wed, 22 Jul 2020 11:09:25 +0300 Subject: [PATCH 170/271] btrfs: don't opencode sync_blockdev in btrfs_init_new_device Instead of opencoding filemap_write_and_wait simply call syncblockdev as it makes it abundantly clear what's going on and why this is used. No semantics changes. Reviewed-by: Johannes Thumshirn Reviewed-by: Anand Jain Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9355f5c78c15..9d169cba8514 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2512,7 +2512,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path locked = true; } - filemap_write_and_wait(bdev->bd_inode->i_mapping); + sync_blockdev(bdev); rcu_read_lock(); list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) { From 329ced799be881feab445b68163cd3869340cc08 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:00 -0400 Subject: [PATCH 171/271] btrfs: rename extent_buffer::lock_nested to extent_buffer::lock_recursed Nested locking with lockdep and everything else refers to lock hierarchy within the same lock map. This is how we indicate the same locks for different objects are ok to take in a specific order, for our use case that would be to take the lock on a leaf and then take a lock on an adjacent leaf. What ->lock_nested _actually_ refers to is if we happen to already be holding the write lock on the extent buffer and we're allowing a read lock to be taken on that extent buffer, which is recursion. Rename this so we don't get confused when we switch to a rwsem and have to start using the _nested helpers. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 2 +- fs/btrfs/extent_io.h | 2 +- fs/btrfs/locking.c | 24 ++++++++++++------------ 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a28d442d65b5..f971e9d689df 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4990,7 +4990,7 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, rwlock_init(&eb->lock); atomic_set(&eb->blocking_readers, 0); eb->blocking_writers = 0; - eb->lock_nested = false; + eb->lock_recursed = false; init_waitqueue_head(&eb->write_lock_wq); init_waitqueue_head(&eb->read_lock_wq); diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 30794ae58498..9e1e22f1586a 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -102,7 +102,7 @@ struct extent_buffer { int blocking_writers; atomic_t blocking_readers; - bool lock_nested; + bool lock_recursed; /* >= 0 if eb belongs to a log tree, -1 otherwise */ short log_index; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index f75612e18a82..14ceb2ce33ac 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -57,8 +57,8 @@ * performance reasons. * * - * Lock nesting - * ------------ + * Lock recursion + * -------------- * * A write operation on a tree might indirectly start a look up on the same * tree. This can happen when btrfs_cow_block locks the tree and needs to @@ -201,7 +201,7 @@ void btrfs_set_lock_blocking_read(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; btrfs_assert_tree_read_locked(eb); atomic_inc(&eb->blocking_readers); @@ -225,7 +225,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * lock, but it won't change to or away from us. If we have the write * lock, we are the owner and it'll never change. */ - if (eb->lock_nested && current->pid == eb->lock_owner) + if (eb->lock_recursed && current->pid == eb->lock_owner) return; if (eb->blocking_writers == 0) { btrfs_assert_spinning_writers_put(eb); @@ -263,8 +263,8 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) * depends on this as it may be called on a partly * (write-)locked tree. */ - BUG_ON(eb->lock_nested); - eb->lock_nested = true; + BUG_ON(eb->lock_recursed); + eb->lock_recursed = true; read_unlock(&eb->lock); trace_btrfs_tree_read_lock(eb, start_ns); return; @@ -362,11 +362,11 @@ void btrfs_tree_read_unlock(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); @@ -388,11 +388,11 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) /* * if we're nested, we have the write lock. No new locking * is needed as long as we are the lock owner. - * The write unlock will do a barrier for us, and the lock_nested + * The write unlock will do a barrier for us, and the lock_recursed * field only matters to the lock owner. */ - if (eb->lock_nested && current->pid == eb->lock_owner) { - eb->lock_nested = false; + if (eb->lock_recursed && current->pid == eb->lock_owner) { + eb->lock_recursed = false; return; } btrfs_assert_tree_read_locked(eb); From 51899412dd95b2d9f50297c527b22ada3da0000c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:01 -0400 Subject: [PATCH 172/271] btrfs: introduce btrfs_path::recurse Our current tree locking stuff allows us to recurse with read locks if we're already holding the write lock. This is necessary for the space cache inode, as we could be holding a lock on the root_tree root when we need to cache a block group, and thus need to be able to read down the root_tree to read in the inode cache. We can get away with this in our current locking, but we won't be able to with a rwsem. Handle this by purposefully annotating the places where we require recursion, so that in the future we can maybe come up with a way to avoid the recursion. In the case of the free space inode, this will be superseded by the free space tree. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 8 ++++---- fs/btrfs/ctree.h | 3 +-- fs/btrfs/inode.c | 2 ++ fs/btrfs/locking.c | 13 ++++++++++--- fs/btrfs/locking.h | 9 +++++++++ 5 files changed, 26 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index be7d01055118..7c99bf112960 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2601,7 +2601,7 @@ static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root, * We don't know the level of the root node until we actually * have it read locked */ - b = btrfs_read_lock_root_node(root); + b = __btrfs_read_lock_root_node(root, p->recurse); level = btrfs_header_level(b); if (level > write_lock_level) goto out; @@ -2875,7 +2875,7 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - btrfs_tree_read_lock(b); + __btrfs_tree_read_lock(b, p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -5453,7 +5453,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, } if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5488,7 +5488,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - btrfs_tree_read_lock(next); + __btrfs_tree_read_lock(next, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f9d4e0958e2e..1339e390a757 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -374,6 +374,7 @@ struct btrfs_path { unsigned int search_commit_root:1; unsigned int need_commit_sem:1; unsigned int skip_release_on_error:1; + unsigned int recurse:1; }; #define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \ sizeof(struct btrfs_item)) @@ -2654,8 +2655,6 @@ void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info, struct btrfs_path *path, const struct btrfs_key *new_key); struct extent_buffer *btrfs_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path, struct btrfs_key *key, int lowest_level, u64 min_trans); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 123521aa5595..1fbc7d9e5103 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -6585,6 +6585,8 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, */ path->leave_spinning = 1; + path->recurse = btrfs_is_free_space_inode(inode); + ret = btrfs_lookup_file_extent(NULL, root, path, objectid, start, 0); if (ret < 0) { goto out; diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 14ceb2ce33ac..740a0cde5731 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -244,7 +244,7 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. */ -void btrfs_tree_read_lock(struct extent_buffer *eb) +void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse) { u64 start_ns = 0; @@ -263,6 +263,7 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) * depends on this as it may be called on a partly * (write-)locked tree. */ + WARN_ON(!recurse); BUG_ON(eb->lock_recursed); eb->lock_recursed = true; read_unlock(&eb->lock); @@ -279,6 +280,11 @@ void btrfs_tree_read_lock(struct extent_buffer *eb) trace_btrfs_tree_read_lock(eb, start_ns); } +void btrfs_tree_read_lock(struct extent_buffer *eb) +{ + __btrfs_tree_read_lock(eb, false); +} + /* * Lock extent buffer for read, optimistically expecting that there are no * contending blocking writers. If there are, don't wait. @@ -552,13 +558,14 @@ struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root) * * Return: root extent buffer with read lock held */ -struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse) { struct extent_buffer *eb; while (1) { eb = btrfs_root_node(root); - btrfs_tree_read_lock(eb); + __btrfs_tree_read_lock(eb, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index d715846c10b8..31f6d6405c1d 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -21,6 +21,7 @@ struct btrfs_path; void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); +void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); @@ -29,6 +30,14 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb); int btrfs_try_tree_read_lock(struct extent_buffer *eb); int btrfs_try_tree_write_lock(struct extent_buffer *eb); int btrfs_tree_read_lock_atomic(struct extent_buffer *eb); +struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); +struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, + bool recurse); + +static inline struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root) +{ + return __btrfs_read_lock_root_node(root, false); +} #ifdef CONFIG_BTRFS_DEBUG static inline void btrfs_assert_tree_locked(struct extent_buffer *eb) { From fd7ba1c1202d15ac96d7b42256550973e12686b5 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:02 -0400 Subject: [PATCH 173/271] btrfs: add nesting tags to the locking helpers We will need these when we switch to an rwsem, so plumb in the infrastructure here to use later on. I violate the 80 character limit some here because it'll be cleaned up later. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 11 ++++++++--- fs/btrfs/locking.c | 14 ++++++++++---- fs/btrfs/locking.h | 25 ++++++++++++++++++++++++- 3 files changed, 42 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7c99bf112960..71e3fa30082c 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -2875,7 +2875,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, } else { if (!btrfs_tree_read_lock_atomic(b)) { btrfs_set_path_blocking(p); - __btrfs_tree_read_lock(b, p->recurse); + __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL, + p->recurse); } p->locks[level] = BTRFS_READ_LOCK; } @@ -5453,7 +5454,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, } if (!ret) { btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, path->recurse); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_NORMAL, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } @@ -5488,7 +5491,9 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, ret = btrfs_try_tree_read_lock(next); if (!ret) { btrfs_set_path_blocking(path); - __btrfs_tree_read_lock(next, path->recurse); + __btrfs_tree_read_lock(next, + BTRFS_NESTING_NORMAL, + path->recurse); } next_rw_lock = BTRFS_READ_LOCK; } diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 740a0cde5731..66e02ebdd340 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -244,7 +244,8 @@ void btrfs_set_lock_blocking_write(struct extent_buffer *eb) * * The rwlock is held upon exit. */ -void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse) +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse) { u64 start_ns = 0; @@ -282,7 +283,7 @@ void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse) void btrfs_tree_read_lock(struct extent_buffer *eb) { - __btrfs_tree_read_lock(eb, false); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, false); } /* @@ -415,7 +416,7 @@ void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb) * * The rwlock is held for write upon exit. */ -void btrfs_tree_lock(struct extent_buffer *eb) +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest) __acquires(&eb->lock) { u64 start_ns = 0; @@ -440,6 +441,11 @@ void btrfs_tree_lock(struct extent_buffer *eb) trace_btrfs_tree_lock(eb, start_ns); } +void btrfs_tree_lock(struct extent_buffer *eb) +{ + __btrfs_tree_lock(eb, BTRFS_NESTING_NORMAL); +} + /* * Release the write lock, either blocking or spinning (ie. there's no need * for an explicit blocking unlock, like btrfs_tree_read_unlock_blocking). @@ -565,7 +571,7 @@ struct extent_buffer *__btrfs_read_lock_root_node(struct btrfs_root *root, while (1) { eb = btrfs_root_node(root); - __btrfs_tree_read_lock(eb, recurse); + __btrfs_tree_read_lock(eb, BTRFS_NESTING_NORMAL, recurse); if (eb == root->node) break; btrfs_tree_read_unlock(eb); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 31f6d6405c1d..5fabda8f0a80 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -16,12 +16,35 @@ #define BTRFS_WRITE_LOCK_BLOCKING 3 #define BTRFS_READ_LOCK_BLOCKING 4 +/* + * We are limited in number of subclasses by MAX_LOCKDEP_SUBCLASSES, which at + * the time of this patch is 8, which is how many we use. Keep this in mind if + * you decide you want to add another subclass. + */ +enum btrfs_lock_nesting { + BTRFS_NESTING_NORMAL, + + /* + * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so + * add this in here and add a static_assert to keep us from going over + * the limit. As of this writing we're limited to 8, and we're + * definitely using 8, hence this check to keep us from messing up in + * the future. + */ + BTRFS_NESTING_MAX, +}; + +static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES, + "too many lock subclasses defined"); + struct btrfs_path; +void __btrfs_tree_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest); void btrfs_tree_lock(struct extent_buffer *eb); void btrfs_tree_unlock(struct extent_buffer *eb); -void __btrfs_tree_read_lock(struct extent_buffer *eb, bool recurse); +void __btrfs_tree_read_lock(struct extent_buffer *eb, enum btrfs_lock_nesting nest, + bool recurse); void btrfs_tree_read_lock(struct extent_buffer *eb); void btrfs_tree_read_unlock(struct extent_buffer *eb); void btrfs_tree_read_unlock_blocking(struct extent_buffer *eb); From 9631e4cc1a030aa71dea588cc9a57533b657489f Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:03 -0400 Subject: [PATCH 174/271] btrfs: introduce BTRFS_NESTING_COW for cow'ing blocks When we COW a block we are holding a lock on the original block, and then we lock the new COW block. Because our lockdep maps are based on root + level, this will make lockdep complain. We need a way to indicate a subclass for locking the COW'ed block, so plumb through our btrfs_lock_nesting from btrfs_cow_block down to the btrfs_init_buffer, and then introduce BTRFS_NESTING_COW to be used for cow'ing blocks. The reason I've added all this extra infrastructure is because there will be need of different nesting classes in follow up patches. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 52 ++++++++++++++++++++++++++---------------- fs/btrfs/ctree.h | 6 +++-- fs/btrfs/disk-io.c | 5 ++-- fs/btrfs/extent-tree.c | 12 ++++++---- fs/btrfs/ioctl.c | 3 ++- fs/btrfs/locking.h | 8 +++++++ fs/btrfs/relocation.c | 11 +++++---- fs/btrfs/transaction.c | 5 ++-- 8 files changed, 66 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 71e3fa30082c..48178fd8740f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,7 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0); + &disk_key, level, buf->start, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -957,7 +957,8 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( const struct btrfs_disk_key *disk_key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *ret; @@ -986,7 +987,7 @@ static struct extent_buffer *alloc_tree_block_no_bg_flush( ret = btrfs_alloc_tree_block(trans, root, parent_start, root->root_key.objectid, disk_key, level, - hint, empty_size); + hint, empty_size, nest); trans->can_flush_pending_bgs = true; return ret; @@ -1009,7 +1010,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, struct extent_buffer **cow_ret, - u64 search_start, u64 empty_size) + u64 search_start, u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_disk_key disk_key; @@ -1040,7 +1042,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = parent->start; cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key, - level, search_start, empty_size); + level, search_start, empty_size, nest); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -1446,7 +1448,8 @@ static inline int should_cow_block(struct btrfs_trans_handle *trans, noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret) + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; u64 search_start; @@ -1485,7 +1488,7 @@ noinline int btrfs_cow_block(struct btrfs_trans_handle *trans, */ btrfs_qgroup_trace_subtree_after_cow(trans, root, buf); ret = __btrfs_cow_block(trans, root, buf, parent, - parent_slot, cow_ret, search_start, 0); + parent_slot, cow_ret, search_start, 0, nest); trace_btrfs_cow_block(root, buf, *cow_ret); @@ -1657,7 +1660,8 @@ int btrfs_realloc_node(struct btrfs_trans_handle *trans, err = __btrfs_cow_block(trans, root, cur, parent, i, &cur, search_start, min(16 * blocksize, - (end_slot - i) * blocksize)); + (end_slot - i) * blocksize), + BTRFS_NESTING_COW); if (err) { btrfs_tree_unlock(cur); free_extent_buffer(cur); @@ -1855,7 +1859,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(child); btrfs_set_lock_blocking_write(child); - ret = btrfs_cow_block(trans, root, child, mid, 0, &child); + ret = btrfs_cow_block(trans, root, child, mid, 0, &child, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(child); free_extent_buffer(child); @@ -1894,7 +1899,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(left); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, - parent, pslot - 1, &left); + parent, pslot - 1, &left, + BTRFS_NESTING_COW); if (wret) { ret = wret; goto enospc; @@ -1909,7 +1915,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_tree_lock(right); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, - parent, pslot + 1, &right); + parent, pslot + 1, &right, + BTRFS_NESTING_COW); if (wret) { ret = wret; goto enospc; @@ -2077,7 +2084,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, wret = 1; } else { ret = btrfs_cow_block(trans, root, left, parent, - pslot - 1, &left); + pslot - 1, &left, + BTRFS_NESTING_COW); if (ret) wret = 1; else { @@ -2132,7 +2140,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right); + &right, BTRFS_NESTING_COW); if (ret) wret = 1; else { @@ -2740,11 +2748,13 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root, btrfs_set_path_blocking(p); if (last_level) err = btrfs_cow_block(trans, root, b, NULL, 0, - &b); + &b, + BTRFS_NESTING_COW); else err = btrfs_cow_block(trans, root, b, p->nodes[level + 1], - p->slots[level + 1], &b); + p->slots[level + 1], &b, + BTRFS_NESTING_COW); if (err) { ret = err; goto done; @@ -3396,7 +3406,8 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_node_key(lower, &lower_key, 0); c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, - root->node->start, 0); + root->node->start, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(c)) return PTR_ERR(c); @@ -3526,7 +3537,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0); + c->start, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(split)) return PTR_ERR(split); @@ -3804,7 +3815,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right); + slot + 1, &right, BTRFS_NESTING_COW); if (ret) goto out_unlock; @@ -4045,7 +4056,8 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, - path->nodes[1], slot - 1, &left); + path->nodes[1], slot - 1, &left, + BTRFS_NESTING_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) @@ -4312,7 +4324,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, btrfs_item_key(l, &disk_key, mid); right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0); + l->start, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(right)) return PTR_ERR(right); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1339e390a757..b4d3a4d82053 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2526,7 +2526,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size); + u64 empty_size, + enum btrfs_lock_nesting nest); void btrfs_free_tree_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, @@ -2667,7 +2668,8 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent, int btrfs_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, struct extent_buffer *parent, int parent_slot, - struct extent_buffer **cow_ret); + struct extent_buffer **cow_ret, + enum btrfs_lock_nesting nest); int btrfs_copy_root(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4eb56f00ee81..ed887242f348 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1205,7 +1205,8 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, root->root_key.type = BTRFS_ROOT_ITEM_KEY; root->root_key.offset = 0; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); leaf = NULL; @@ -1277,7 +1278,7 @@ static struct btrfs_root *alloc_log_tree(struct btrfs_trans_handle *trans, */ leaf = btrfs_alloc_tree_block(trans, root, 0, BTRFS_TREE_LOG_OBJECTID, - NULL, 0, 0, 0); + NULL, 0, 0, 0, BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { btrfs_put_root(root); return ERR_CAST(leaf); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7b61b7b5122f..3b21fee13e77 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4656,7 +4656,8 @@ int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans, static struct extent_buffer * btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, - u64 bytenr, int level, u64 owner) + u64 bytenr, int level, u64 owner, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *buf; @@ -4679,7 +4680,7 @@ btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root, } btrfs_set_buffer_lockdep_class(owner, buf, level); - btrfs_tree_lock(buf); + __btrfs_tree_lock(buf, nest); btrfs_clean_tree_block(buf); clear_bit(EXTENT_BUFFER_STALE, &buf->bflags); @@ -4725,7 +4726,8 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, u64 parent, u64 root_objectid, const struct btrfs_disk_key *key, int level, u64 hint, - u64 empty_size) + u64 empty_size, + enum btrfs_lock_nesting nest) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key ins; @@ -4741,7 +4743,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (btrfs_is_testing(fs_info)) { buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr, - level, root_objectid); + level, root_objectid, nest); if (!IS_ERR(buf)) root->alloc_bytenr += blocksize; return buf; @@ -4758,7 +4760,7 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans, goto out_unuse; buf = btrfs_init_new_buffer(trans, root, ins.objectid, level, - root_objectid); + root_objectid, nest); if (IS_ERR(buf)) { ret = PTR_ERR(buf); goto out_free_reserved; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 3779a6c12184..b91444e810a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -628,7 +628,8 @@ static noinline int create_subvol(struct inode *dir, if (ret) goto fail; - leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0); + leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0, + BTRFS_NESTING_NORMAL); if (IS_ERR(leaf)) { ret = PTR_ERR(leaf); goto fail; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5fabda8f0a80..8b47ba34fb03 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -24,6 +24,14 @@ enum btrfs_lock_nesting { BTRFS_NESTING_NORMAL, + /* + * When we COW a block we are holding the lock on the original block, + * and since our lockdep maps are rootid+level, this confuses lockdep + * when we lock the newly allocated COW'd block. Handle this by having + * a subclass for COW'ed blocks so that lockdep doesn't complain. + */ + BTRFS_NESTING_COW, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 4ba1ab9cc76d..3602806d71bd 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -1206,7 +1206,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, } if (cow) { - ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb); + ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1274,7 +1275,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, btrfs_tree_lock(eb); if (cow) { ret = btrfs_cow_block(trans, dest, eb, parent, - slot, &eb); + slot, &eb, + BTRFS_NESTING_COW); BUG_ON(ret); } btrfs_set_lock_blocking_write(eb); @@ -1781,7 +1783,8 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, * relocated and the block is tree root. */ leaf = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf); + ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf, + BTRFS_NESTING_COW); btrfs_tree_unlock(leaf); free_extent_buffer(leaf); if (ret < 0) @@ -2308,7 +2311,7 @@ static int do_relocation(struct btrfs_trans_handle *trans, if (!node->eb) { ret = btrfs_cow_block(trans, root, eb, upper->eb, - slot, &eb); + slot, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); if (ret < 0) { diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f8265dbec9c3..52ada47aff50 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1184,7 +1184,7 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) eb = btrfs_lock_root_node(fs_info->tree_root); ret = btrfs_cow_block(trans, fs_info->tree_root, eb, NULL, - 0, &eb); + 0, &eb, BTRFS_NESTING_COW); btrfs_tree_unlock(eb); free_extent_buffer(eb); @@ -1589,7 +1589,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, btrfs_set_root_otransid(new_root_item, trans->transid); old = btrfs_lock_root_node(root); - ret = btrfs_cow_block(trans, root, old, NULL, 0, &old); + ret = btrfs_cow_block(trans, root, old, NULL, 0, &old, + BTRFS_NESTING_COW); if (ret) { btrfs_tree_unlock(old); free_extent_buffer(old); From bf77467a93bdf11d815f5c5f25206a0d058da8de Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:04 -0400 Subject: [PATCH 175/271] btrfs: introduce BTRFS_NESTING_LEFT/BTRFS_NESTING_RIGHT Our lockdep maps are based on rootid+level, however in some cases we will lock adjacent blocks on the same level, namely in searching forward or in split/balance. Because of this lockdep will complain, so we need a separate subclass to indicate to lockdep that these are different locks. lock leaf -> BTRFS_NESTING_NORMAL cow leaf -> BTRFS_NESTING_COW split leaf lock left -> BTRFS_NESTING_LEFT lock right -> BTRFS_NESTING_RIGHT The above graph illustrates the need for this new nesting subclass. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 16 ++++++++-------- fs/btrfs/locking.h | 12 ++++++++++++ 2 files changed, 20 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 48178fd8740f..94af1f385ea6 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1896,7 +1896,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, left = NULL; if (left) { - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, @@ -1912,7 +1912,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, right = NULL; if (right) { - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, @@ -2076,7 +2076,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (left) { u32 left_nr; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); left_nr = btrfs_header_nritems(left); @@ -2131,7 +2131,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, if (right) { u32 right_nr; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); right_nr = btrfs_header_nritems(right); @@ -3806,7 +3806,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(right)) return 1; - btrfs_tree_lock(right); + __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT); btrfs_set_lock_blocking_write(right); free_space = btrfs_leaf_free_space(right); @@ -4045,7 +4045,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root if (IS_ERR(left)) return 1; - btrfs_tree_lock(left); + __btrfs_tree_lock(left, BTRFS_NESTING_LEFT); btrfs_set_lock_blocking_write(left); free_space = btrfs_leaf_free_space(left); @@ -5467,7 +5467,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, if (!ret) { btrfs_set_path_blocking(path); __btrfs_tree_read_lock(next, - BTRFS_NESTING_NORMAL, + BTRFS_NESTING_RIGHT, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; @@ -5504,7 +5504,7 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path, if (!ret) { btrfs_set_path_blocking(path); __btrfs_tree_read_lock(next, - BTRFS_NESTING_NORMAL, + BTRFS_NESTING_RIGHT, path->recurse); } next_rw_lock = BTRFS_READ_LOCK; diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 8b47ba34fb03..5844bc1c8410 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -32,6 +32,18 @@ enum btrfs_lock_nesting { */ BTRFS_NESTING_COW, + /* + * Oftentimes we need to lock adjacent nodes on the same level while + * still holding the lock on the original node we searched to, such as + * for searching forward or for split/balance. + * + * Because of this we need to indicate to lockdep that this is + * acceptable by having a different subclass for each of these + * operations. + */ + BTRFS_NESTING_LEFT, + BTRFS_NESTING_RIGHT, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From bf59a5a21604ef79da9105bef1bae817fd053e75 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:05 -0400 Subject: [PATCH 176/271] btrfs: introduce BTRFS_NESTING_LEFT/RIGHT_COW For similar reasons as BTRFS_NESTING_COW, we need BTRFS_NESTING_LEFT/RIGHT_COW. The pattern is this lock leaf -> BTRFS_NESTING_NORMAL cow leaf -> BTRFS_NESTING_COW split leaf lock left -> BTRFS_NESTING_LEFT cow left -> BTRFS_NESTING_LEFT_COW We need this in order to indicate to lockdep that these locks are discrete and are being taken in a safe order. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 ++++++------ fs/btrfs/locking.h | 8 ++++++++ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 94af1f385ea6..ecf8bf240d73 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1900,7 +1900,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking_write(left); wret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, - BTRFS_NESTING_COW); + BTRFS_NESTING_LEFT_COW); if (wret) { ret = wret; goto enospc; @@ -1916,7 +1916,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, btrfs_set_lock_blocking_write(right); wret = btrfs_cow_block(trans, root, right, parent, pslot + 1, &right, - BTRFS_NESTING_COW); + BTRFS_NESTING_RIGHT_COW); if (wret) { ret = wret; goto enospc; @@ -2085,7 +2085,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, left, parent, pslot - 1, &left, - BTRFS_NESTING_COW); + BTRFS_NESTING_LEFT_COW); if (ret) wret = 1; else { @@ -2140,7 +2140,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, } else { ret = btrfs_cow_block(trans, root, right, parent, pslot + 1, - &right, BTRFS_NESTING_COW); + &right, BTRFS_NESTING_RIGHT_COW); if (ret) wret = 1; else { @@ -3815,7 +3815,7 @@ static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, right, upper, - slot + 1, &right, BTRFS_NESTING_COW); + slot + 1, &right, BTRFS_NESTING_RIGHT_COW); if (ret) goto out_unlock; @@ -4057,7 +4057,7 @@ static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root /* cow and double check */ ret = btrfs_cow_block(trans, root, left, path->nodes[1], slot - 1, &left, - BTRFS_NESTING_COW); + BTRFS_NESTING_LEFT_COW); if (ret) { /* we hit -ENOSPC, but it isn't fatal here */ if (ret == -ENOSPC) diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 5844bc1c8410..16563bc486c7 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -44,6 +44,14 @@ enum btrfs_lock_nesting { BTRFS_NESTING_LEFT, BTRFS_NESTING_RIGHT, + /* + * When splitting we will be holding a lock on the left/right node when + * we need to cow that node, thus we need a new set of subclasses for + * these two operations. + */ + BTRFS_NESTING_LEFT_COW, + BTRFS_NESTING_RIGHT_COW, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From 4dff97e69005ea90266f3e3dda295264e854c15d Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:06 -0400 Subject: [PATCH 177/271] btrfs: introduce BTRFS_NESTING_SPLIT for split blocks If we are splitting a leaf/node, we could do something like the following lock(leaf) BTRFS_NESTING_NORMAL lock(left) BTRFS_NESTING_LEFT + BTRFS_NESTING_COW push from leaf -> left reset path to point to left split left allocate new block, lock block BTRFS_NESTING_SPLIT at the new block point we need to have a different nesting level, because we have already used either BTRFS_NESTING_LEFT or BTRFS_NESTING_RIGHT when pushing items from the original leaf into the adjacent leaves. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 4 ++-- fs/btrfs/locking.h | 9 +++++++++ 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index ecf8bf240d73..884c8b5a0e62 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -3537,7 +3537,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, btrfs_node_key(c, &disk_key, mid); split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level, - c->start, 0, BTRFS_NESTING_NORMAL); + c->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(split)) return PTR_ERR(split); @@ -4324,7 +4324,7 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, btrfs_item_key(l, &disk_key, mid); right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0, BTRFS_NESTING_NORMAL); + l->start, 0, BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index 16563bc486c7..a6b59808e046 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -52,6 +52,15 @@ enum btrfs_lock_nesting { BTRFS_NESTING_LEFT_COW, BTRFS_NESTING_RIGHT_COW, + /* + * When splitting we may push nodes to the left or right, but still use + * the subsequent nodes in our path, keeping our locks on those adjacent + * blocks. Thus when we go to allocate a new split block we've already + * used up all of our available subclasses, so this subclass exists to + * handle this case where we need to allocate a new split block. + */ + BTRFS_NESTING_SPLIT, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From cf6f34aa3ada0be8c5f90fe93f48a75fea082c51 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:07 -0400 Subject: [PATCH 178/271] btrfs: introduce BTRFS_NESTING_NEW_ROOT for adding new roots The way we add new roots is confusing from a locking perspective for lockdep. We generally have the rule that we lock things in order from highest level to lowest, but in the case of adding a new level to the tree we actually allocate a new block for the root, which makes the locking go in reverse. A similar issue exists for snapshotting, we cow the original root for the root of a new tree, however they're at the same level. Address this by using BTRFS_NESTING_NEW_ROOT for these operations. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 5 +++-- fs/btrfs/locking.h | 9 +++++++++ 2 files changed, 12 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 884c8b5a0e62..d61ea238ad8a 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -198,7 +198,8 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, btrfs_node_key(buf, &disk_key, 0); cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid, - &disk_key, level, buf->start, 0, BTRFS_NESTING_NORMAL); + &disk_key, level, buf->start, 0, + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(cow)) return PTR_ERR(cow); @@ -3407,7 +3408,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level, root->node->start, 0, - BTRFS_NESTING_NORMAL); + BTRFS_NESTING_NEW_ROOT); if (IS_ERR(c)) return PTR_ERR(c); diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index a6b59808e046..3ea81ed3320b 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -61,6 +61,15 @@ enum btrfs_lock_nesting { */ BTRFS_NESTING_SPLIT, + /* + * When promoting a new block to a root we need to have a special + * subclass so we don't confuse lockdep, as it will appear that we are + * locking a higher level node before a lower level one. Copying also + * has this problem as it appears we're locking the same block again + * when we make a snapshot of an existing root. + */ + BTRFS_NESTING_NEW_ROOT, + /* * We are limited to MAX_LOCKDEP_SUBLCLASSES number of subclasses, so * add this in here and add a static_assert to keep us from going over From ca9d473a3e300bcddc73c00fdf2f4bf6ca43c4a2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 20 Aug 2020 11:46:08 -0400 Subject: [PATCH 179/271] btrfs: use BTRFS_NESTED_NEW_ROOT for double splits I've made this change separate since it requires both of the newly added NESTED flags and I didn't want to slip it into one of those changes. If we do a double split of a node we can end up doing a BTRFS_NESTED_SPLIT on level 0, which throws lockdep off because it appears as a double lock. Since we're maxed out on subclasses, use BTRFS_NESTED_NEW_ROOT if we had to do a double split. This is OK because we won't have to do a double split if we had to insert a new root, and the new root would be at a higher level anyway. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index d61ea238ad8a..7dbfa365eb18 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4324,8 +4324,18 @@ static noinline int split_leaf(struct btrfs_trans_handle *trans, else btrfs_item_key(l, &disk_key, mid); + /* + * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double + * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES + * subclasses, which is 8 at the time of this patch, and we've maxed it + * out. In the future we could add a + * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just + * use BTRFS_NESTING_NEW_ROOT. + */ right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0, - l->start, 0, BTRFS_NESTING_SPLIT); + l->start, 0, num_doubles ? + BTRFS_NESTING_NEW_ROOT : + BTRFS_NESTING_SPLIT); if (IS_ERR(right)) return PTR_ERR(right); From b79b724969ad5e5a01548c75f30fc14c893e825b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:38 +0300 Subject: [PATCH 180/271] btrfs: make inode_tree_del take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 1fbc7d9e5103..2306fbaafb28 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -5305,15 +5305,15 @@ static void inode_tree_add(struct inode *inode) spin_unlock(&root->inode_lock); } -static void inode_tree_del(struct inode *inode) +static void inode_tree_del(struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; int empty = 0; spin_lock(&root->inode_lock); - if (!RB_EMPTY_NODE(&BTRFS_I(inode)->rb_node)) { - rb_erase(&BTRFS_I(inode)->rb_node, &root->inode_tree); - RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); + if (!RB_EMPTY_NODE(&inode->rb_node)) { + rb_erase(&inode->rb_node, &root->inode_tree); + RB_CLEAR_NODE(&inode->rb_node); empty = RB_EMPTY_ROOT(&root->inode_tree); } spin_unlock(&root->inode_lock); @@ -8685,7 +8685,7 @@ void btrfs_destroy_inode(struct inode *inode) } } btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); - inode_tree_del(inode); + inode_tree_del(BTRFS_I(inode)); btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); btrfs_put_root(BTRFS_I(inode)->root); From 6d072c8e291f4127a7a40addb7ef553f9490a600 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:39 +0300 Subject: [PATCH 181/271] btrfs: make btrfs_lookup_first_ordered_extent take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 6 ++++-- fs/btrfs/inode.c | 3 ++- fs/btrfs/ordered-data.c | 6 +++--- fs/btrfs/ordered-data.h | 2 +- 4 files changed, 10 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 4bf15846efdc..7c37a43cc2e3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2541,7 +2541,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, lockend); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + lockend); /* * We need to make sure we have no ordered extents in this range @@ -3400,7 +3401,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); - ordered = btrfs_lookup_first_ordered_extent(inode, locked_end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + locked_end); if (ordered && ordered->file_offset + ordered->num_bytes > alloc_start && diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 2306fbaafb28..93063edc2d7b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8672,7 +8672,8 @@ void btrfs_destroy_inode(struct inode *inode) return; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), + (u64)-1); if (!ordered) break; else { diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4732c5b89460..5a7e9c5872d3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -710,7 +710,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) end = orig_end; while (1) { - ordered = btrfs_lookup_first_ordered_extent(inode, end); + ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), end); if (!ordered) break; if (ordered->file_offset > orig_end) { @@ -838,13 +838,13 @@ void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, * if none is found */ struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode *inode, u64 file_offset) +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) { struct btrfs_ordered_inode_tree *tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); node = tree_search(tree, file_offset); if (!node) diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 644258a7dfb1..b287a2a403e6 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -178,7 +178,7 @@ void btrfs_start_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * -btrfs_lookup_first_ordered_extent(struct inode * inode, u64 file_offset); +btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); struct btrfs_ordered_extent *btrfs_lookup_ordered_range( struct btrfs_inode *inode, u64 file_offset, From acbf1dd0fcbd10c67826a19958f55a053b32f532 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:40 +0300 Subject: [PATCH 182/271] btrfs: make ordered extent tracepoint take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ordered-data.c | 8 ++++---- include/trace/events/btrfs.h | 17 ++++++++--------- 2 files changed, 12 insertions(+), 13 deletions(-) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 5a7e9c5872d3..369ddad30d9c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -217,7 +217,7 @@ static int __btrfs_add_ordered_extent(struct btrfs_inode *inode, u64 file_offset INIT_LIST_HEAD(&entry->work_list); init_completion(&entry->completion); - trace_btrfs_ordered_extent_add(&inode->vfs_inode, entry); + trace_btrfs_ordered_extent_add(inode, entry); spin_lock_irq(&tree->lock); node = tree_insert(&tree->tree, file_offset, @@ -442,7 +442,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) struct list_head *cur; struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); + trace_btrfs_ordered_extent_put(BTRFS_I(entry->inode), entry); if (refcount_dec_and_test(&entry->refs)) { ASSERT(list_empty(&entry->root_extent_list)); @@ -528,7 +528,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(inode, entry); + trace_btrfs_ordered_extent_remove(BTRFS_I(inode), entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); @@ -658,7 +658,7 @@ void btrfs_start_ordered_extent(struct inode *inode, u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; - trace_btrfs_ordered_extent_start(inode, entry); + trace_btrfs_ordered_extent_start(BTRFS_I(inode), entry); /* * pages in the range can be dirty, clean or writeback. We diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index b9241836d4f7..8d311062d376 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -510,7 +510,7 @@ DEFINE_EVENT( DECLARE_EVENT_CLASS(btrfs__ordered_extent, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered), @@ -529,8 +529,8 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __field( u64, truncated_len ) ), - TP_fast_assign_btrfs(btrfs_sb(inode->i_sb), - __entry->ino = btrfs_ino(BTRFS_I(inode)); + TP_fast_assign_btrfs(inode->root->fs_info, + __entry->ino = btrfs_ino(inode); __entry->file_offset = ordered->file_offset; __entry->start = ordered->disk_bytenr; __entry->len = ordered->num_bytes; @@ -539,8 +539,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, __entry->flags = ordered->flags; __entry->compress_type = ordered->compress_type; __entry->refs = refcount_read(&ordered->refs); - __entry->root_objectid = - BTRFS_I(inode)->root->root_key.objectid; + __entry->root_objectid = inode->root->root_key.objectid; __entry->truncated_len = ordered->truncated_len; ), @@ -563,7 +562,7 @@ DECLARE_EVENT_CLASS(btrfs__ordered_extent, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -571,7 +570,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_add, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -579,7 +578,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_remove, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) @@ -587,7 +586,7 @@ DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_start, DEFINE_EVENT(btrfs__ordered_extent, btrfs_ordered_extent_put, - TP_PROTO(const struct inode *inode, + TP_PROTO(const struct btrfs_inode *inode, const struct btrfs_ordered_extent *ordered), TP_ARGS(inode, ordered) From 90c0304c6307a68e1e4af637d56d6613ed026875 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:41 +0300 Subject: [PATCH 183/271] btrfs: make btrfs_dec_test_ordered_pending take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ordered-data.c | 7 +++---- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 8 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 93063edc2d7b..8b50cad7ee8f 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2781,8 +2781,8 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, - end - start + 1, uptodate)) + if (!btrfs_dec_test_ordered_pending(BTRFS_I(inode), &ordered_extent, + start, end - start + 1, uptodate)) return; if (btrfs_is_free_space_inode(BTRFS_I(inode))) @@ -8184,8 +8184,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, ordered->truncated_len = new_len; spin_unlock_irq(&tree->lock); - if (btrfs_dec_test_ordered_pending(inode, &ordered, - start, + if (btrfs_dec_test_ordered_pending(BTRFS_I(inode), + &ordered, start, end - start + 1, 1)) btrfs_finish_ordered_io(ordered); } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 369ddad30d9c..168a5edd939d 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -378,17 +378,16 @@ int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, * test_and_set_bit on a flag in the struct btrfs_ordered_extent is used * to make sure this function only returns 1 once for a given ordered extent. */ -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate) { - struct btrfs_ordered_inode_tree *tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; struct rb_node *node; struct btrfs_ordered_extent *entry = NULL; unsigned long flags; int ret; - tree = &BTRFS_I(inode)->ordered_tree; spin_lock_irqsave(&tree->lock, flags); if (cached && *cached) { entry = *cached; @@ -409,7 +408,7 @@ int btrfs_dec_test_ordered_pending(struct inode *inode, } if (io_size > entry->bytes_left) { - btrfs_crit(BTRFS_I(inode)->root->fs_info, + btrfs_crit(inode->root->fs_info, "bad ordered accounting left %llu size %llu", entry->bytes_left, io_size); } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index b287a2a403e6..1610195c6097 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -153,7 +153,7 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); void btrfs_remove_ordered_extent(struct inode *inode, struct btrfs_ordered_extent *entry); -int btrfs_dec_test_ordered_pending(struct inode *inode, +int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, u64 file_offset, u64 io_size, int uptodate); int btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, From 6fee248d2beb3591f6f50aeeac843b366b116e3b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:42 +0300 Subject: [PATCH 184/271] btrfs: convert btrfs_inode_sectorsize to take btrfs_inode It's counterintuitive to have a function named btrfs_inode_xxx which takes a generic inode. Also move the function to btrfs_inode.h so that it has access to the definition of struct btrfs_inode. Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 5 +++++ fs/btrfs/ctree.h | 4 ---- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/file.c | 10 +++++----- fs/btrfs/ordered-data.c | 2 +- fs/btrfs/reflink.c | 2 +- 6 files changed, 15 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 00f7831d0902..6fdb46d58299 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -217,6 +217,11 @@ struct btrfs_inode { struct inode vfs_inode; }; +static inline u32 btrfs_inode_sectorsize(const struct btrfs_inode *inode) +{ + return inode->root->fs_info->sectorsize; +} + static inline struct btrfs_inode *BTRFS_I(const struct inode *inode) { return container_of(inode, struct btrfs_inode, vfs_inode); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index b4d3a4d82053..af2bd059daae 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1197,10 +1197,6 @@ struct btrfs_file_private { void *filldir_buf; }; -static inline u32 btrfs_inode_sectorsize(const struct inode *inode) -{ - return btrfs_sb(inode->i_sb)->sectorsize; -} static inline u32 BTRFS_LEAF_DATA_SIZE(const struct btrfs_fs_info *info) { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f971e9d689df..94ec2455de99 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4555,7 +4555,7 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) static struct extent_map *get_extent_skip_holes(struct inode *inode, u64 offset, u64 last) { - u64 sectorsize = btrfs_inode_sectorsize(inode); + u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); struct extent_map *em; u64 len; @@ -4736,8 +4736,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out_free_ulist; } - start = round_down(start, btrfs_inode_sectorsize(inode)); - len = round_up(max, btrfs_inode_sectorsize(inode)) - start; + start = round_down(start, btrfs_inode_sectorsize(BTRFS_I(inode))); + len = round_up(max, btrfs_inode_sectorsize(BTRFS_I(inode))) - start; /* * lookup the last file extent. We're not using i_size here diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7c37a43cc2e3..30c511d1e70b 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2901,9 +2901,9 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out_only_mutex; } - lockstart = round_up(offset, btrfs_inode_sectorsize(inode)); + lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode))); lockend = round_down(offset + len, - btrfs_inode_sectorsize(inode)) - 1; + btrfs_inode_sectorsize(BTRFS_I(inode))) - 1; same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* @@ -3108,7 +3108,7 @@ enum { static int btrfs_zero_range_check_range_boundary(struct inode *inode, u64 offset) { - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); struct extent_map *em; int ret; @@ -3138,7 +3138,7 @@ static int btrfs_zero_range(struct inode *inode, struct extent_changeset *data_reserved = NULL; int ret; u64 alloc_hint = 0; - const u64 sectorsize = btrfs_inode_sectorsize(inode); + const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -3319,7 +3319,7 @@ static long btrfs_fallocate(struct file *file, int mode, u64 locked_end; u64 actual_end = 0; struct extent_map *em; - int blocksize = btrfs_inode_sectorsize(inode); + int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode)); int ret; alloc_start = round_down(offset, blocksize); diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 168a5edd939d..d39a0fe4c463 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -870,7 +870,7 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; unsigned long num_sectors; unsigned long i; - u32 sectorsize = btrfs_inode_sectorsize(inode); + u32 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 5cd02514cf4d..7126f94cf216 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -52,7 +52,7 @@ static int copy_inline_to_page(struct inode *inode, const u64 datal, const u8 comp_type) { - const u64 block_size = btrfs_inode_sectorsize(inode); + const u64 block_size = btrfs_inode_sectorsize(BTRFS_I(inode)); const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); From 53ac7ead2446947facccbda392bbf499be435a59 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:43 +0300 Subject: [PATCH 185/271] btrfs: make btrfs_invalidatepage work on btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 18 ++++++++---------- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 8b50cad7ee8f..301bbc41e963 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8124,15 +8124,15 @@ static int btrfs_migratepage(struct address_space *mapping, static void btrfs_invalidatepage(struct page *page, unsigned int offset, unsigned int length) { - struct inode *inode = page->mapping->host; - struct extent_io_tree *tree; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct extent_io_tree *tree = &inode->io_tree; struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; u64 page_start = page_offset(page); u64 page_end = page_start + PAGE_SIZE - 1; u64 start; u64 end; - int inode_evicting = inode->i_state & I_FREEING; + int inode_evicting = inode->vfs_inode.i_state & I_FREEING; /* * we have the page locked, so new writeback can't start, @@ -8143,7 +8143,6 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, */ wait_on_page_writeback(page); - tree = &BTRFS_I(inode)->io_tree; if (offset) { btrfs_releasepage(page, GFP_NOFS); return; @@ -8153,8 +8152,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, lock_extent_bits(tree, page_start, page_end, &cached_state); again: start = page_start; - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), start, - page_end - start + 1); + ordered = btrfs_lookup_ordered_range(inode, start, page_end - start + 1); if (ordered) { end = min(page_end, ordered->file_offset + ordered->num_bytes - 1); @@ -8175,7 +8173,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, struct btrfs_ordered_inode_tree *tree; u64 new_len; - tree = &BTRFS_I(inode)->ordered_tree; + tree = &inode->ordered_tree; spin_lock_irq(&tree->lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -8184,8 +8182,8 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, ordered->truncated_len = new_len; spin_unlock_irq(&tree->lock); - if (btrfs_dec_test_ordered_pending(BTRFS_I(inode), - &ordered, start, + if (btrfs_dec_test_ordered_pending(inode, &ordered, + start, end - start + 1, 1)) btrfs_finish_ordered_io(ordered); } @@ -8214,7 +8212,7 @@ static void btrfs_invalidatepage(struct page *page, unsigned int offset, * bit of its io_tree, and free the qgroup reserved data space. * Since the IO will never happen for this page. */ - btrfs_qgroup_free_data(BTRFS_I(inode), NULL, page_start, PAGE_SIZE); + btrfs_qgroup_free_data(inode, NULL, page_start, PAGE_SIZE); if (!inode_evicting) { clear_extent_bit(tree, page_start, page_end, EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | From 3347c48f276754eaf8f41a6e0368ed558a9453be Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:44 +0300 Subject: [PATCH 186/271] btrfs: make btrfs_writepage_endio_finish_ordered btrfs_inode-centric Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 301bbc41e963..6201bbb4ca90 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2773,19 +2773,19 @@ static void finish_ordered_fn(struct btrfs_work *work) void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, u64 end, int uptodate) { - struct inode *inode = page->mapping->host; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_extent *ordered_extent = NULL; struct btrfs_workqueue *wq; trace_btrfs_writepage_end_io_hook(page, start, end, uptodate); ClearPagePrivate2(page); - if (!btrfs_dec_test_ordered_pending(BTRFS_I(inode), &ordered_extent, - start, end - start + 1, uptodate)) + if (!btrfs_dec_test_ordered_pending(inode, &ordered_extent, start, + end - start + 1, uptodate)) return; - if (btrfs_is_free_space_inode(BTRFS_I(inode))) + if (btrfs_is_free_space_inode(inode)) wq = fs_info->endio_freespace_worker; else wq = fs_info->endio_write_workers; From f1bbde8d5f278050e63b7330f6d6b230fe1ea398 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:45 +0300 Subject: [PATCH 187/271] btrfs: make get_extent_skip_holes take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 94ec2455de99..5206ae72594b 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4552,10 +4552,10 @@ int try_release_extent_mapping(struct page *page, gfp_t mask) * helper function for fiemap, which doesn't want to see any holes. * This maps until we find something past 'last' */ -static struct extent_map *get_extent_skip_holes(struct inode *inode, +static struct extent_map *get_extent_skip_holes(struct btrfs_inode *inode, u64 offset, u64 last) { - u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; u64 len; @@ -4567,7 +4567,7 @@ static struct extent_map *get_extent_skip_holes(struct inode *inode, if (len == 0) break; len = ALIGN(len, sectorsize); - em = btrfs_get_extent_fiemap(BTRFS_I(inode), offset, len); + em = btrfs_get_extent_fiemap(inode, offset, len); if (IS_ERR_OR_NULL(em)) return em; @@ -4787,7 +4787,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, &cached_state); - em = get_extent_skip_holes(inode, start, last_for_get_extent); + em = get_extent_skip_holes(BTRFS_I(inode), start, last_for_get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -4876,7 +4876,8 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(inode, off, last_for_get_extent); + em = get_extent_skip_holes(BTRFS_I(inode), off, + last_for_get_extent); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; From 3c5641a83ac4d2e8ef3d542b35794962e51d7135 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:46 +0300 Subject: [PATCH 188/271] btrfs: make btrfs_find_ordered_sum take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file-item.c | 4 ++-- fs/btrfs/ordered-data.c | 19 +++++++++---------- fs/btrfs/ordered-data.h | 4 ++-- 3 files changed, 13 insertions(+), 14 deletions(-) diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 7d5ec71615b8..8f4f2bd6d9b9 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -318,8 +318,8 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, if (page_offsets) offset = page_offset(bvec.bv_page) + bvec.bv_offset; - count = btrfs_find_ordered_sum(inode, offset, disk_bytenr, - csum, nblocks); + count = btrfs_find_ordered_sum(BTRFS_I(inode), offset, + disk_bytenr, csum, nblocks); if (count) goto found; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index d39a0fe4c463..c8a13d143877 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -861,20 +861,21 @@ btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset) * try to find a checksum. This is used because we allow pages to * be reclaimed before their checksum is actually put into the btree */ -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len) +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_ordered_sum *ordered_sum; struct btrfs_ordered_extent *ordered; - struct btrfs_ordered_inode_tree *tree = &BTRFS_I(inode)->ordered_tree; + struct btrfs_ordered_inode_tree *tree = &inode->ordered_tree; unsigned long num_sectors; unsigned long i; - u32 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + u32 sectorsize = btrfs_inode_sectorsize(inode); + const u8 blocksize_bits = inode->vfs_inode.i_sb->s_blocksize_bits; const u16 csum_size = btrfs_super_csum_size(fs_info->super_copy); int index = 0; - ordered = btrfs_lookup_ordered_extent(BTRFS_I(inode), offset); + ordered = btrfs_lookup_ordered_extent(inode, offset); if (!ordered) return 0; @@ -882,10 +883,8 @@ int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, list_for_each_entry_reverse(ordered_sum, &ordered->list, list) { if (disk_bytenr >= ordered_sum->bytenr && disk_bytenr < ordered_sum->bytenr + ordered_sum->len) { - i = (disk_bytenr - ordered_sum->bytenr) >> - inode->i_sb->s_blocksize_bits; - num_sectors = ordered_sum->len >> - inode->i_sb->s_blocksize_bits; + i = (disk_bytenr - ordered_sum->bytenr) >> blocksize_bits; + num_sectors = ordered_sum->len >> blocksize_bits; num_sectors = min_t(int, len - index, num_sectors - i); memcpy(sum + index, ordered_sum->sums + i * csum_size, num_sectors * csum_size); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 1610195c6097..6a1d5bf5aee3 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -185,8 +185,8 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_range( u64 len); void btrfs_get_ordered_extents_for_logging(struct btrfs_inode *inode, struct list_head *list); -int btrfs_find_ordered_sum(struct inode *inode, u64 offset, u64 disk_bytenr, - u8 *sum, int len); +int btrfs_find_ordered_sum(struct btrfs_inode *inode, u64 offset, + u64 disk_bytenr, u8 *sum, int len); u64 btrfs_wait_ordered_extents(struct btrfs_root *root, u64 nr, const u64 range_start, const u64 range_len); void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, From 998acfe8ffc14812ad427081b156dc286901ed9a Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:47 +0300 Subject: [PATCH 189/271] btrfs: make copy_inline_to_page take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 7126f94cf216..2461be6ccb6f 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -45,19 +45,20 @@ static int clone_finish_inode_update(struct btrfs_trans_handle *trans, return ret; } -static int copy_inline_to_page(struct inode *inode, +static int copy_inline_to_page(struct btrfs_inode *inode, const u64 file_offset, char *inline_data, const u64 size, const u64 datal, const u8 comp_type) { - const u64 block_size = btrfs_inode_sectorsize(BTRFS_I(inode)); + const u64 block_size = btrfs_inode_sectorsize(inode); const u64 range_end = file_offset + block_size - 1; const size_t inline_size = size - btrfs_file_extent_calc_inline_size(0); char *data_start = inline_data + btrfs_file_extent_calc_inline_size(0); struct extent_changeset *data_reserved = NULL; struct page *page = NULL; + struct address_space *mapping = inode->vfs_inode.i_mapping; int ret; ASSERT(IS_ALIGNED(file_offset, block_size)); @@ -68,24 +69,23 @@ static int copy_inline_to_page(struct inode *inode, * reservation here. Also we must not do the reservation while holding * a transaction open, otherwise we would deadlock. */ - ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - file_offset, block_size); + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, file_offset, + block_size); if (ret) goto out; - page = find_or_create_page(inode->i_mapping, file_offset >> PAGE_SHIFT, - btrfs_alloc_write_mask(inode->i_mapping)); + page = find_or_create_page(mapping, file_offset >> PAGE_SHIFT, + btrfs_alloc_write_mask(mapping)); if (!page) { ret = -ENOMEM; goto out_unlock; } set_page_extent_mapped(page); - clear_extent_bit(&BTRFS_I(inode)->io_tree, file_offset, range_end, + clear_extent_bit(&inode->io_tree, file_offset, range_end, EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, 0, 0, NULL); - ret = btrfs_set_extent_delalloc(BTRFS_I(inode), file_offset, range_end, - 0, NULL); + ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -134,9 +134,9 @@ static int copy_inline_to_page(struct inode *inode, put_page(page); } if (ret) - btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, - file_offset, block_size, true); - btrfs_delalloc_release_extents(BTRFS_I(inode), block_size); + btrfs_delalloc_release_space(inode, data_reserved, file_offset, + block_size, true); + btrfs_delalloc_release_extents(inode, block_size); out: extent_changeset_free(data_reserved); @@ -167,8 +167,8 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -194,7 +194,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(dst, new_key->offset, + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, inline_data, size, datal, comp_type); goto out; @@ -213,8 +213,8 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } @@ -231,8 +231,8 @@ static int clone_copy_inline_extent(struct inode *dst, * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ - ret = copy_inline_to_page(dst, new_key->offset, inline_data, - size, datal, comp_type); + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); goto out; } From 948dfeb86baef1bfa108e93acac5cee23d578e62 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:48 +0300 Subject: [PATCH 190/271] btrfs: make btrfs_zero_range_check_range_boundary take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 30c511d1e70b..af4eab9cbc51 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -3105,15 +3105,15 @@ enum { RANGE_BOUNDARY_HOLE, }; -static int btrfs_zero_range_check_range_boundary(struct inode *inode, +static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, u64 offset) { - const u64 sectorsize = btrfs_inode_sectorsize(BTRFS_I(inode)); + const u64 sectorsize = btrfs_inode_sectorsize(inode); struct extent_map *em; int ret; offset = round_down(offset, sectorsize); - em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize); + em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize); if (IS_ERR(em)) return PTR_ERR(em); @@ -3228,7 +3228,8 @@ static int btrfs_zero_range(struct inode *inode, * to cover them. */ if (!IS_ALIGNED(offset, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, offset); + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), + offset); if (ret < 0) goto out; if (ret == RANGE_BOUNDARY_HOLE) { @@ -3244,7 +3245,7 @@ static int btrfs_zero_range(struct inode *inode, } if (!IS_ALIGNED(offset + len, sectorsize)) { - ret = btrfs_zero_range_check_range_boundary(inode, + ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode), offset + len); if (ret < 0) goto out; From facee0a09c15805202e37d12e0fecce1d28b4d5f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 31 Aug 2020 14:42:49 +0300 Subject: [PATCH 191/271] btrfs: make extent_fiemap take btrfs_inode Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 28 +++++++++++++--------------- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 15 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 5206ae72594b..a1e070ec7ad8 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -4696,7 +4696,7 @@ static int emit_last_fiemap_cache(struct fiemap_extent_info *fieinfo, return ret; } -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; @@ -4707,12 +4707,12 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 last; u64 last_for_get_extent = 0; u64 disko = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); struct btrfs_key found_key; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct fiemap_cache cache = { 0 }; struct ulist *roots; struct ulist *tmp_ulist; @@ -4736,15 +4736,15 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, goto out_free_ulist; } - start = round_down(start, btrfs_inode_sectorsize(BTRFS_I(inode))); - len = round_up(max, btrfs_inode_sectorsize(BTRFS_I(inode))) - start; + start = round_down(start, btrfs_inode_sectorsize(inode)); + len = round_up(max, btrfs_inode_sectorsize(inode)) - start; /* * lookup the last file extent. We're not using i_size here * because there might be preallocation past i_size */ - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), -1, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), -1, + 0); if (ret < 0) { goto out_free_ulist; } else { @@ -4758,7 +4758,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, found_type = found_key.type; /* No extents, but there might be delalloc bits */ - if (found_key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (found_key.objectid != btrfs_ino(inode) || found_type != BTRFS_EXTENT_DATA_KEY) { /* have to trust i_size as the end */ last = (u64)-1; @@ -4784,10 +4784,10 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, last_for_get_extent = isize; } - lock_extent_bits(&BTRFS_I(inode)->io_tree, start, start + len - 1, + lock_extent_bits(&inode->io_tree, start, start + len - 1, &cached_state); - em = get_extent_skip_holes(BTRFS_I(inode), start, last_for_get_extent); + em = get_extent_skip_holes(inode, start, last_for_get_extent); if (!em) goto out; if (IS_ERR(em)) { @@ -4853,8 +4853,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, * then we're just getting a count and we can skip the * lookup stuff. */ - ret = btrfs_check_shared(root, - btrfs_ino(BTRFS_I(inode)), + ret = btrfs_check_shared(root, btrfs_ino(inode), bytenr, roots, tmp_ulist); if (ret < 0) goto out_free; @@ -4876,8 +4875,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, } /* now scan forward to see if this is really the last extent. */ - em = get_extent_skip_holes(BTRFS_I(inode), off, - last_for_get_extent); + em = get_extent_skip_holes(inode, off, last_for_get_extent); if (IS_ERR(em)) { ret = PTR_ERR(em); goto out; @@ -4899,7 +4897,7 @@ int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, ret = emit_last_fiemap_cache(fieinfo, &cache); free_extent_map(em); out: - unlock_extent_cached(&BTRFS_I(inode)->io_tree, start, start + len - 1, + unlock_extent_cached(&inode->io_tree, start, start + len - 1, &cached_state); out_free_ulist: diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 9e1e22f1586a..06611947a9f7 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -203,7 +203,7 @@ int extent_writepages(struct address_space *mapping, int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); void extent_readahead(struct readahead_control *rac); -int extent_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, +int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); void set_page_extent_mapped(struct page *page); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 6201bbb4ca90..a66145aac710 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8035,7 +8035,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - return extent_fiemap(inode, fieinfo, start, len); + return extent_fiemap(BTRFS_I(inode), fieinfo, start, len); } int btrfs_readpage(struct file *file, struct page *page) From ca10845a56856fff4de3804c85e6424d0f6d0cde Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Sep 2020 08:09:01 -0400 Subject: [PATCH 192/271] btrfs: sysfs: init devices outside of the chunk_mutex While running btrfs/061, btrfs/073, btrfs/078, or btrfs/178 we hit the following lockdep splat: ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc3+ #4 Not tainted ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff96ecc22ef4a0 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc+0x37/0x270 alloc_inode+0x82/0xb0 iget_locked+0x10d/0x2c0 kernfs_get_inode+0x1b/0x130 kernfs_get_tree+0x136/0x240 sysfs_get_tree+0x16/0x40 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (kernfs_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 kernfs_add_one+0x23/0x150 kernfs_create_link+0x63/0xa0 sysfs_do_create_link_sd+0x5e/0xd0 btrfs_sysfs_add_devices_dir+0x81/0x130 btrfs_init_new_device+0x67f/0x1250 btrfs_ioctl+0x1ef/0x2e20 __x64_sys_ioctl+0x83/0xb0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_insert_empty_items+0x64/0xb0 btrfs_insert_delayed_items+0x90/0x4f0 btrfs_commit_inode_delayed_items+0x93/0x140 btrfs_log_inode+0x5de/0x2020 btrfs_log_inode_parent+0x429/0xc90 btrfs_log_new_name+0x95/0x9b btrfs_rename2+0xbb9/0x1800 vfs_rename+0x64f/0x9f0 do_renameat2+0x320/0x4e0 __x64_sys_rename+0x1f/0x30 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> kernfs_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(kernfs_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffff8dd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff8dd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff96ed2ade30e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 0 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #4 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb8 check_noncircular+0x12d/0x150 __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa7/0x3d0 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x41/0x50 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This happens because we are holding the chunk_mutex at the time of adding in a new device. However we only need to hold the device_list_mutex, as we're going to iterate over the fs_devices devices. Move the sysfs init stuff outside of the chunk_mutex to get rid of this lockdep splat. CC: stable@vger.kernel.org # 4.4.x: f3cd2c58110dad14e: btrfs: sysfs, rename device_link add/remove functions CC: stable@vger.kernel.org # 4.4.x Reported-by: David Sterba Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9d169cba8514..c86ffad04641 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2597,9 +2597,6 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_set_super_num_devices(fs_info->super_copy, orig_super_num_devices + 1); - /* add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); - /* * we've got more storage, clear any full flags on the space * infos @@ -2607,6 +2604,10 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path btrfs_clear_space_info_full(fs_info); mutex_unlock(&fs_info->chunk_mutex); + + /* Add sysfs device entry */ + btrfs_sysfs_add_devices_dir(fs_devices, device); + mutex_unlock(&fs_devices->device_list_mutex); if (seeding_dev) { From c3e1f96c37d0f863ac6ddcafac6921fd46f1aa37 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 25 Aug 2020 10:02:32 -0500 Subject: [PATCH 193/271] btrfs: enumerate the type of exclusive operation in progress Instead of using a flag bit for exclusive operation, use a variable to store which exclusive operation is being performed. Introduce an API to start and finish an exclusive operation. This would enable another way for tools to check which operation is running on why starting an exclusive operation failed. The followup patch adds a sysfs_notify() to alert userspace when the state changes, so userspace can perform select() on it to get notified of the change. This would enable us to enqueue a command which will wait for current exclusive operation to complete before issuing the next exclusive operation. This has been done synchronously as opposed to a background process, or else error collection (if any) will become difficult. Reviewed-by: Nikolay Borisov Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba [ update comments ] Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 24 ++++++++++++++++++----- fs/btrfs/dev-replace.c | 4 ++-- fs/btrfs/inode.c | 14 +++++++------- fs/btrfs/ioctl.c | 44 ++++++++++++++++++++++++++---------------- fs/btrfs/volumes.c | 20 +++++++++---------- 5 files changed, 65 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index af2bd059daae..98c5f6178efc 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -541,11 +541,6 @@ enum { BTRFS_FS_QUOTA_OVERRIDE, /* Used to record internally whether fs has been frozen */ BTRFS_FS_FROZEN, - /* - * Indicate that a whole-filesystem exclusive operation is running - * (device replace, resize, device add/delete, balance) - */ - BTRFS_FS_EXCL_OP, /* * Indicate that balance has been set up from the ioctl and is in the * main phase. The fs_info::balance_ctl is initialized. @@ -566,6 +561,19 @@ enum { BTRFS_FS_DISCARD_RUNNING, }; +/* + * Exclusive operations (device replace, resize, device add/remove, balance) + */ +enum btrfs_exclusive_operation { + BTRFS_EXCLOP_NONE, + BTRFS_EXCLOP_BALANCE, + BTRFS_EXCLOP_DEV_ADD, + BTRFS_EXCLOP_DEV_REMOVE, + BTRFS_EXCLOP_DEV_REPLACE, + BTRFS_EXCLOP_RESIZE, + BTRFS_EXCLOP_SWAP_ACTIVATE, +}; + struct btrfs_fs_info { u8 chunk_tree_uuid[BTRFS_UUID_SIZE]; unsigned long flags; @@ -937,6 +945,9 @@ struct btrfs_fs_info { */ int send_in_progress; + /* Type of exclusive operation running */ + unsigned long exclusive_operation; + #ifdef CONFIG_BTRFS_FS_REF_VERIFY spinlock_t ref_verify_lock; struct rb_root block_tree; @@ -3032,6 +3043,9 @@ void btrfs_get_block_group_info(struct list_head *groups_list, struct btrfs_ioctl_space_info *space); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type); +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info); /* file.c */ int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 61d800a149b0..14901ca2508d 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -1025,7 +1025,7 @@ int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info) * should never allow both to start and pause. We don't want to allow * dev-replace to start anyway. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { down_write(&dev_replace->rwsem); dev_replace->replace_state = BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED; @@ -1062,7 +1062,7 @@ static int btrfs_dev_replace_kthread(void *data) ret = btrfs_dev_replace_finishing(fs_info, ret); WARN_ON(ret && ret != -ECANCELED); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return 0; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a66145aac710..cce6f8789a4e 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10015,14 +10015,14 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, /* * Balance or device remove/replace/resize can move stuff around from - * under us. The EXCL_OP flag makes sure they aren't running/won't run - * concurrently while we are mapping the swap extents, and - * fs_info->swapfile_pins prevents them from running while the swap file - * is active and moving the extents. Note that this also prevents a - * concurrent device add which isn't actually necessary, but it's not + * under us. The exclop protection makes sure they aren't running/won't + * run concurrently while we are mapping the swap extents, and + * fs_info->swapfile_pins prevents them from running while the swap + * file is active and moving the extents. Note that this also prevents + * a concurrent device add which isn't actually necessary, but it's not * really worth the trouble to allow it. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_SWAP_ACTIVATE)) { btrfs_warn(fs_info, "cannot activate swapfile while exclusive operation is running"); return -EBUSY; @@ -10168,7 +10168,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, if (ret) btrfs_swap_deactivate(file); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (ret) return ret; diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index b91444e810a5..f30231596fe0 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -378,6 +378,17 @@ static int check_xflags(unsigned int flags) return 0; } +bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, + enum btrfs_exclusive_operation type) +{ + return !cmpxchg(&fs_info->exclusive_operation, BTRFS_EXCLOP_NONE, type); +} + +void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) +{ + WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); +} + /* * Set the xflags from the internal inode flags. The remaining items of fsxattr * are zeroed. @@ -1639,7 +1650,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_RESIZE)) { mnt_drop_write_file(file); return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } @@ -1753,7 +1764,7 @@ static noinline int btrfs_ioctl_resize(struct file *file, out_free: kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); mnt_drop_write_file(file); return ret; } @@ -3127,7 +3138,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; vol_args = memdup_user(arg, sizeof(*vol_args)); @@ -3144,7 +3155,7 @@ static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg) kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -3173,7 +3184,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out; } @@ -3184,7 +3195,7 @@ static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg) vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0'; ret = btrfs_rm_device(fs_info, vol_args->name, 0); } - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); if (!ret) { if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) @@ -3215,7 +3226,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) if (ret) return ret; - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; goto out_drop_write; } @@ -3233,7 +3244,7 @@ static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg) btrfs_info(fs_info, "disk deleted %s", vol_args->name); kfree(vol_args); out: - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out_drop_write: mnt_drop_write_file(file); @@ -3737,11 +3748,11 @@ static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info, ret = -EROFS; goto out; } - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) { ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS; } else { ret = btrfs_dev_replace_by_ioctl(fs_info, p); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } break; case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS: @@ -3952,7 +3963,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) return ret; again: - if (!test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) { + if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) { mutex_lock(&fs_info->balance_mutex); need_unlock = true; goto locked; @@ -3998,7 +4009,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) } locked: - BUG_ON(!test_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)); if (arg) { bargs = memdup_user(arg, sizeof(*bargs)); @@ -4053,10 +4063,10 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) do_balance: /* - * Ownership of bctl and filesystem flag BTRFS_FS_EXCL_OP goes to - * btrfs_balance. bctl is freed in reset_balance_state, or, if - * restriper was paused all the way until unmount, in free_fs_info. - * The flag should be cleared after reset_balance_state. + * Ownership of bctl and exclusive operation goes to btrfs_balance. + * bctl is freed in reset_balance_state, or, if restriper was paused + * all the way until unmount, in free_fs_info. The flag should be + * cleared after reset_balance_state. */ need_unlock = false; @@ -4075,7 +4085,7 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg) out_unlock: mutex_unlock(&fs_info->balance_mutex); if (need_unlock) - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); out: mnt_drop_write_file(file); return ret; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c86ffad04641..cbebb5d06773 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -291,8 +291,8 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * balance_mutex * * - * Exclusive operations, BTRFS_FS_EXCL_OP - * ====================================== + * Exclusive operations + * ==================== * * Maintains the exclusivity of the following operations that apply to the * whole filesystem and cannot run in parallel. @@ -318,11 +318,11 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, * - system power-cycle and filesystem mounted as read-only * - filesystem or device errors leading to forced read-only * - * BTRFS_FS_EXCL_OP flag is set and cleared using atomic operations. - * During the course of Paused state, the BTRFS_FS_EXCL_OP remains set. + * The status of exclusive operation is set and cleared atomically. + * During the course of Paused state, fs_info::exclusive_operation remains set. * A device operation in Paused or Running state can be canceled or resumed * either by ioctl (Balance only) or when remounted as read-write. - * BTRFS_FS_EXCL_OP flag is cleared when the device operation is canceled or + * The exclusive status is cleared when the device operation is canceled or * completed. */ @@ -4033,7 +4033,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, /* * rw_devices will not change at the moment, device add/delete/replace - * are excluded by EXCL_OP + * are exclusive */ num_devices = fs_info->fs_devices->rw_devices; @@ -4169,7 +4169,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, if ((ret && ret != -ECANCELED && ret != -ENOSPC) || balance_need_close(fs_info)) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); } wake_up(&fs_info->balance_wait_q); @@ -4180,7 +4180,7 @@ int btrfs_balance(struct btrfs_fs_info *fs_info, reset_balance_state(fs_info); else kfree(bctl); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); return ret; } @@ -4282,7 +4282,7 @@ int btrfs_recover_balance(struct btrfs_fs_info *fs_info) * is in a paused state and must have fs_info::balance_ctl properly * set up. */ - if (test_and_set_bit(BTRFS_FS_EXCL_OP, &fs_info->flags)) + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) btrfs_warn(fs_info, "balance: cannot set exclusive op status, resume manually"); @@ -4364,7 +4364,7 @@ int btrfs_cancel_balance(struct btrfs_fs_info *fs_info) if (fs_info->balance_ctl) { reset_balance_state(fs_info); - clear_bit(BTRFS_FS_EXCL_OP, &fs_info->flags); + btrfs_exclop_finish(fs_info); btrfs_info(fs_info, "balance: canceled"); } } From 66a2823c54367b4d366cfa5dc12ef99ef560f428 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Tue, 25 Aug 2020 10:02:33 -0500 Subject: [PATCH 194/271] btrfs: sysfs: export currently running exclusive operation /sys/fs//exclusive_operation contains the currently executing exclusive operation. Add a sysfs_notify() when operation end, so userspace can be notified of exclusive operation is finished. Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ioctl.c | 1 + fs/btrfs/sysfs.c | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 38 insertions(+) diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index f30231596fe0..8091324341a5 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -387,6 +387,7 @@ bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) { WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE); + sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } /* diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 438a367371c4..eb8f23578c16 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -809,6 +809,42 @@ static ssize_t btrfs_checksum_show(struct kobject *kobj, BTRFS_ATTR(, checksum, btrfs_checksum_show); +static ssize_t btrfs_exclusive_operation_show(struct kobject *kobj, + struct kobj_attribute *a, char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + const char *str; + + switch (READ_ONCE(fs_info->exclusive_operation)) { + case BTRFS_EXCLOP_NONE: + str = "none\n"; + break; + case BTRFS_EXCLOP_BALANCE: + str = "balance\n"; + break; + case BTRFS_EXCLOP_DEV_ADD: + str = "device add\n"; + break; + case BTRFS_EXCLOP_DEV_REMOVE: + str = "device remove\n"; + break; + case BTRFS_EXCLOP_DEV_REPLACE: + str = "device replace\n"; + break; + case BTRFS_EXCLOP_RESIZE: + str = "resize\n"; + break; + case BTRFS_EXCLOP_SWAP_ACTIVATE: + str = "swap activate\n"; + break; + default: + str = "UNKNOWN\n"; + break; + } + return scnprintf(buf, PAGE_SIZE, "%s", str); +} +BTRFS_ATTR(, exclusive_operation, btrfs_exclusive_operation_show); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -817,6 +853,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, quota_override), BTRFS_ATTR_PTR(, metadata_uuid), BTRFS_ATTR_PTR(, checksum), + BTRFS_ATTR_PTR(, exclusive_operation), NULL, }; From 457f1864b569c4a5d49046fb9a88ab82e108cdfd Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 3 Sep 2020 14:29:51 -0400 Subject: [PATCH 195/271] btrfs: pretty print leaked root name I'm a actual human being so am incapable of converting u64 to s64 in my head, so add a helper to get the pretty name of a root objectid and use that helper to spit out the name for any special roots for leaked roots, so I don't have to scratch my head and figure out which root I messed up the refs for. Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ++++-- fs/btrfs/print-tree.c | 38 ++++++++++++++++++++++++++++++++++++++ fs/btrfs/print-tree.h | 4 ++++ 3 files changed, 46 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ed887242f348..f44f3d672a1f 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -1503,10 +1503,12 @@ void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info) struct btrfs_root *root; while (!list_empty(&fs_info->allocated_roots)) { + char buf[BTRFS_ROOT_NAME_BUF_LEN]; + root = list_first_entry(&fs_info->allocated_roots, struct btrfs_root, leak_list); - btrfs_err(fs_info, "leaked root %llu-%llu refcount %d", - root->root_key.objectid, root->root_key.offset, + btrfs_err(fs_info, "leaked root %s refcount %d", + btrfs_root_name(root->root_key.objectid, buf), refcount_read(&root->refs)); while (refcount_read(&root->refs) > 1) btrfs_put_root(root); diff --git a/fs/btrfs/print-tree.c b/fs/btrfs/print-tree.c index 80567c11ec12..7695c4783d33 100644 --- a/fs/btrfs/print-tree.c +++ b/fs/btrfs/print-tree.c @@ -7,6 +7,44 @@ #include "disk-io.h" #include "print-tree.h" +struct root_name_map { + u64 id; + char name[16]; +}; + +static const struct root_name_map root_map[] = { + { BTRFS_ROOT_TREE_OBJECTID, "ROOT_TREE" }, + { BTRFS_EXTENT_TREE_OBJECTID, "EXTENT_TREE" }, + { BTRFS_CHUNK_TREE_OBJECTID, "CHUNK_TREE" }, + { BTRFS_DEV_TREE_OBJECTID, "DEV_TREE" }, + { BTRFS_FS_TREE_OBJECTID, "FS_TREE" }, + { BTRFS_CSUM_TREE_OBJECTID, "CSUM_TREE" }, + { BTRFS_TREE_LOG_OBJECTID, "TREE_LOG" }, + { BTRFS_QUOTA_TREE_OBJECTID, "QUOTA_TREE" }, + { BTRFS_UUID_TREE_OBJECTID, "UUID_TREE" }, + { BTRFS_FREE_SPACE_TREE_OBJECTID, "FREE_SPACE_TREE" }, + { BTRFS_DATA_RELOC_TREE_OBJECTID, "DATA_RELOC_TREE" }, +}; + +const char *btrfs_root_name(u64 objectid, char *buf) +{ + int i; + + if (objectid == BTRFS_TREE_RELOC_OBJECTID) { + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, + "TREE_RELOC offset=%llu", objectid); + return buf; + } + + for (i = 0; i < ARRAY_SIZE(root_map); i++) { + if (root_map[i].id == objectid) + return root_map[i].name; + } + + snprintf(buf, BTRFS_ROOT_NAME_BUF_LEN, "%llu", objectid); + return buf; +} + static void print_chunk(struct extent_buffer *eb, struct btrfs_chunk *chunk) { int num_stripes = btrfs_chunk_num_stripes(eb, chunk); diff --git a/fs/btrfs/print-tree.h b/fs/btrfs/print-tree.h index e6bb38fd75ad..78b99385a503 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,7 +6,11 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +/* Buffer size to contain tree name and possibly additional data (offset) */ +#define BTRFS_ROOT_NAME_BUF_LEN 48 + void btrfs_print_leaf(struct extent_buffer *l); void btrfs_print_tree(struct extent_buffer *c, bool follow); +const char *btrfs_root_name(u64 objectid, char *buf); #endif From 79dae17d8d44b2d15779e332180080af45df5352 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Thu, 3 Sep 2020 21:30:12 +0800 Subject: [PATCH 196/271] btrfs: improve device scanning messages Systems booting without the initramfs seems to scan an unusual kind of device path (/dev/root). And at a later time, the device is updated to the correct path. We generally print the process name and PID of the process scanning the device but we don't capture the same information if the device path is rescanned with a different pathname. The current message is too long, so drop the unnecessary UUID and add process name and PID. While at this also update the duplicate device warning to include the process name and PID so the messages are consistent CC: stable@vger.kernel.org # 4.19+ Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=89721 Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index cbebb5d06773..13b394d401f9 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -941,16 +941,18 @@ static noinline struct btrfs_device *device_list_add(const char *path, bdput(path_bdev); mutex_unlock(&fs_devices->device_list_mutex); btrfs_warn_in_rcu(device->fs_info, - "duplicate device fsid:devid for %pU:%llu old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "duplicate device %s devid %llu generation %llu scanned by %s (%d)", + path, devid, found_transid, + current->comm, + task_pid_nr(current)); return ERR_PTR(-EEXIST); } bdput(path_bdev); btrfs_info_in_rcu(device->fs_info, - "device fsid %pU devid %llu moved old:%s new:%s", - disk_super->fsid, devid, - rcu_str_deref(device->name), path); + "devid %llu device path %s changed to %s scanned by %s (%d)", + devid, rcu_str_deref(device->name), + path, current->comm, + task_pid_nr(current)); } name = rcu_string_strdup(path, GFP_NOFS); From c6a5d954950c5031444173ad2195efc163afcac9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:22 +0800 Subject: [PATCH 197/271] btrfs: fix replace of seed device If you replace a seed device in a sprouted fs, it appears to have successfully replaced the seed device, but if you look closely, it didn't. Here is an example. $ mkfs.btrfs /dev/sda $ btrfstune -S1 /dev/sda $ mount /dev/sda /btrfs $ btrfs device add /dev/sdb /btrfs $ umount /btrfs $ btrfs device scan --forget $ mount -o device=/dev/sda /dev/sdb /btrfs $ btrfs replace start -f /dev/sda /dev/sdc /btrfs $ echo $? 0 BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc started BTRFS info (device sdb): dev_replace from /dev/sda (devid 1) to /dev/sdc finished $ btrfs fi show Label: none uuid: ab2c88b7-be81-4a7e-9849-c3666e7f9f4f Total devices 2 FS bytes used 256.00KiB devid 1 size 3.00GiB used 520.00MiB path /dev/sdc devid 2 size 3.00GiB used 896.00MiB path /dev/sdb Label: none uuid: 10bd3202-0415-43af-96a8-d5409f310a7e Total devices 1 FS bytes used 128.00KiB devid 1 size 3.00GiB used 536.00MiB path /dev/sda So as per the replace start command and kernel log replace was successful. Now let's try to clean mount. $ umount /btrfs $ btrfs device scan --forget $ mount -o device=/dev/sdc /dev/sdb /btrfs mount: /btrfs: wrong fs type, bad option, bad superblock on /dev/sdb, missing codepage or helper program, or other error. [ 636.157517] BTRFS error (device sdc): failed to read chunk tree: -2 [ 636.180177] BTRFS error (device sdc): open_ctree failed That's because per dev items it is still looking for the original seed device. $ btrfs inspect-internal dump-tree -d /dev/sdb item 0 key (DEV_ITEMS DEV_ITEM 1) itemoff 16185 itemsize 98 devid 1 total_bytes 3221225472 bytes_used 545259520 io_align 4096 io_width 4096 sector_size 4096 type 0 generation 6 start_offset 0 dev_group 0 seek_speed 0 bandwidth 0 uuid 59368f50-9af2-4b17-91da-8a783cc418d4 <--- seed uuid fsid 10bd3202-0415-43af-96a8-d5409f310a7e <--- seed fsid item 1 key (DEV_ITEMS DEV_ITEM 2) itemoff 16087 itemsize 98 devid 2 total_bytes 3221225472 bytes_used 939524096 io_align 4096 io_width 4096 sector_size 4096 type 0 generation 0 start_offset 0 dev_group 0 seek_speed 0 bandwidth 0 uuid 56a0a6bc-4630-4998-8daf-3c3030c4256a <- sprout uuid fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f <- sprout fsid But the replaced target has the following uuid+fsid in its superblock which doesn't match with the expected uuid+fsid in its devitem. $ btrfs in dump-super /dev/sdc | egrep '^generation|dev_item.uuid|dev_item.fsid|devid' generation 20 dev_item.uuid 59368f50-9af2-4b17-91da-8a783cc418d4 dev_item.fsid ab2c88b7-be81-4a7e-9849-c3666e7f9f4f [match] dev_item.devid 1 So if you provide the original seed device the mount shall be successful. Which so long happening in the test case btrfs/163. $ btrfs device scan --forget $ mount -o device=/dev/sda /dev/sdb /btrfs Fix in this patch: If a seed is not sprouted then there is no replacement of it, because of its read-only filesystem with a read-only device. Similarly, in the case of a sprouted filesystem, the seed device is still read only. So, mark it as you can't replace a seed device, you can only add a new device and then delete the seed device. If replace is attempted then returns -EINVAL. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 14901ca2508d..bbb3d5ec9ca8 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -230,7 +230,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, int ret = 0; *device_out = NULL; - if (fs_info->fs_devices->seeding) { + if (srcdev->fs_devices->seeding) { btrfs_err(fs_info, "the filesystem is a seed filesystem!"); return -EINVAL; } From 178a16c94041fe5ebf86c5620094e185cec27dd2 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:23 +0800 Subject: [PATCH 198/271] btrfs: add btrfs_sysfs_add_device helper btrfs_sysfs_add_devices_dir() adds device link and devid kobject (sysfs entries) for a device or all the devices in the btrfs_fs_devices. In preparation to add these sysfs entries for the seed as well, add a btrfs_sysfs_add_device() helper function and avoid code duplication. Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 82 +++++++++++++++++++++++++++++++----------------- 1 file changed, 54 insertions(+), 28 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index eb8f23578c16..8ffe5df99b3c 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1312,44 +1312,70 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +static int btrfs_sysfs_add_device(struct btrfs_device *device) { - int error = 0; - struct btrfs_device *dev; + int ret; unsigned int nofs_flag; + struct kobject *devices_kobj; + struct kobject *devinfo_kobj; + + /* + * Make sure we use the fs_info::fs_devices to fetch the kobjects even + * for the seed fs_devices + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + devinfo_kobj = device->fs_info->fs_devices->devinfo_kobj; + ASSERT(devices_kobj); + ASSERT(devinfo_kobj); nofs_flag = memalloc_nofs_save(); - list_for_each_entry(dev, &fs_devices->devices, dev_list) { - if (one_device && one_device != dev) - continue; + if (device->bdev) { + struct hd_struct *disk; + struct kobject *disk_kobj; - if (dev->bdev) { - struct hd_struct *disk; - struct kobject *disk_kobj; + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; - disk = dev->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - - error = sysfs_create_link(fs_devices->devices_kobj, - disk_kobj, disk_kobj->name); - if (error) - break; - } - - init_completion(&dev->kobj_unregister); - error = kobject_init_and_add(&dev->devid_kobj, &devid_ktype, - fs_devices->devinfo_kobj, "%llu", - dev->devid); - if (error) { - kobject_put(&dev->devid_kobj); - break; + ret = sysfs_create_link(devices_kobj, disk_kobj, disk_kobj->name); + if (ret) { + btrfs_warn(device->fs_info, + "creating sysfs device link for devid %llu failed: %d", + device->devid, ret); + goto out; } } - memalloc_nofs_restore(nofs_flag); - return error; + init_completion(&device->kobj_unregister); + ret = kobject_init_and_add(&device->devid_kobj, &devid_ktype, + devinfo_kobj, "%llu", device->devid); + if (ret) { + kobject_put(&device->devid_kobj); + btrfs_warn(device->fs_info, + "devinfo init for devid %llu failed: %d", + device->devid, ret); + } + +out: + memalloc_nofs_restore(nofs_flag); + return ret; +} + +int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device) +{ + int ret; + + if (device) + return btrfs_sysfs_add_device(device); + + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + return ret; + } + + return 0; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) From 985e233e96e56b9980d5f632fc306bfa6e03f70f Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:24 +0800 Subject: [PATCH 199/271] btrfs: add btrfs_sysfs_remove_device helper btrfs_sysfs_remove_devices_dir() removes device link and devid kobject (sysfs entries) for a device or all the devices in the btrfs_fs_devices. In preparation to remove these sysfs entries for the seed as well, add a btrfs_sysfs_remove_device() helper function and avoid code duplication. Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 61 +++++++++++++++++++++--------------------------- 1 file changed, 27 insertions(+), 34 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 8ffe5df99b3c..a3bd4ba906cf 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1186,50 +1186,43 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -/* when one_device is NULL, it removes all device links */ - -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device) +static void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; + struct kobject *devices_kobj; - if (!fs_devices->devices_kobj) - return -EINVAL; + /* + * Seed fs_devices devices_kobj aren't used, fetch kobject from the + * fs_info::fs_devices. + */ + devices_kobj = device->fs_info->fs_devices->devices_kobj; + ASSERT(devices_kobj); - if (one_device) { - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } + if (device->bdev) { + disk = device->bdev->bd_part; + disk_kobj = &part_to_dev(disk)->kobj; + sysfs_remove_link(devices_kobj, disk_kobj->name); + } - if (one_device->devid_kobj.state_initialized) { - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); - } + if (device->devid_kobj.state_initialized) { + kobject_del(&device->devid_kobj); + kobject_put(&device->devid_kobj); + wait_for_completion(&device->kobj_unregister); + } +} +/* When @device is NULL, remove all devices link */ +int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device) +{ + if (device) { + btrfs_sysfs_remove_device(device); return 0; } - list_for_each_entry(one_device, &fs_devices->devices, dev_list) { - - if (one_device->bdev) { - disk = one_device->bdev->bd_part; - disk_kobj = &part_to_dev(disk)->kobj; - sysfs_remove_link(fs_devices->devices_kobj, - disk_kobj->name); - } - if (one_device->devid_kobj.state_initialized) { - kobject_del(&one_device->devid_kobj); - kobject_put(&one_device->devid_kobj); - - wait_for_completion(&one_device->kobj_unregister); - } - } + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); return 0; } From 6a416a018f1a9372a43465c366e91d202da01e11 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:25 +0800 Subject: [PATCH 200/271] btrfs: make btrfs_sysfs_remove_devices_dir return void btrfs_sysfs_remove_devices_dir() return value is unused declare it as void. Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 +++----- fs/btrfs/sysfs.h | 4 ++-- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index a3bd4ba906cf..66fbb82b0bd5 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1213,18 +1213,16 @@ static void btrfs_sysfs_remove_device(struct btrfs_device *device) } /* When @device is NULL, remove all devices link */ -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) +void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device) { if (device) { btrfs_sysfs_remove_device(device); - return 0; + return; } list_for_each_entry(device, &fs_devices->devices, dev_list) btrfs_sysfs_remove_device(device); - - return 0; } static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 4217823e255c..1431dbc3c2ef 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -16,8 +16,8 @@ char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char *btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *one_device); -int btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, + struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); From cd36da2e7ec67f0da9c1efc8a87ad31bd9242ec3 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:26 +0800 Subject: [PATCH 201/271] btrfs: simplify parameters of btrfs_sysfs_add_devices_dir When we add a device we need to add it to sysfs, so instead of using the btrfs_sysfs_add_devices_dir() fs_devices argument to specify whether to add a device or all of fs_devices, call the helper function directly btrfs_sysfs_add_device() and thus make it non-static. Reviewed-by: Nikolay Borisov Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 11 ++++------- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 2 +- 4 files changed, 7 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index bbb3d5ec9ca8..c176375c2c02 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -512,7 +512,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, atomic64_set(&dev_replace->num_uncorrectable_read_errors, 0); up_write(&dev_replace->rwsem); - ret = btrfs_sysfs_add_devices_dir(tgt_device->fs_devices, tgt_device); + ret = btrfs_sysfs_add_device(tgt_device); if (ret) btrfs_err(fs_info, "kobj add dev failed %d", ret); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 66fbb82b0bd5..bc341560dc69 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1303,7 +1303,7 @@ static struct kobj_type devid_ktype = { .release = btrfs_release_devid_kobj, }; -static int btrfs_sysfs_add_device(struct btrfs_device *device) +int btrfs_sysfs_add_device(struct btrfs_device *device) { int ret; unsigned int nofs_flag; @@ -1352,13 +1352,10 @@ static int btrfs_sysfs_add_device(struct btrfs_device *device) return ret; } -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) +static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; - - if (device) - return btrfs_sysfs_add_device(device); + struct btrfs_device *device; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); @@ -1456,7 +1453,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devs = fs_info->fs_devices; struct kobject *fsid_kobj = &fs_devs->fsid_kobj; - error = btrfs_sysfs_add_devices_dir(fs_devs, NULL); + error = btrfs_sysfs_add_fs_devices(fs_devs); if (error) return error; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 1431dbc3c2ef..6296c4f6b330 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -14,8 +14,7 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char *btrfs_feature_set_name(enum btrfs_feature_set set); -int btrfs_sysfs_add_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *one_device); +int btrfs_sysfs_add_device(struct btrfs_device *device); void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 13b394d401f9..2aa7fd89efbf 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2608,7 +2608,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path mutex_unlock(&fs_info->chunk_mutex); /* Add sysfs device entry */ - btrfs_sysfs_add_devices_dir(fs_devices, device); + btrfs_sysfs_add_device(device); mutex_unlock(&fs_devices->device_list_mutex); From 53f8a74cbeffd45a95de5dc6d16584aadb682a31 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:27 +0800 Subject: [PATCH 202/271] btrfs: split and refactor btrfs_sysfs_remove_devices_dir Similar to btrfs_sysfs_add_devices_dir()'s refactoring, split btrfs_sysfs_remove_devices_dir() so that we don't have to use the device argument to indicate whether to free all devices or just one device. Export btrfs_sysfs_remove_device() as device operations outside of sysfs.c now calls this instead of btrfs_sysfs_remove_devices_dir(). btrfs_sysfs_remove_devices_dir() is renamed to btrfs_sysfs_remove_fs_devices() to suite its new role. Now, no one outside of sysfs.c calls btrfs_sysfs_remove_fs_devices() so it is redeclared s static. And the same function had to be moved before its first caller. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 2 +- fs/btrfs/sysfs.c | 27 +++++++++++---------------- fs/btrfs/sysfs.h | 3 +-- fs/btrfs/volumes.c | 8 ++++---- 4 files changed, 17 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index c176375c2c02..db4d4a53ab73 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -781,7 +781,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, mutex_unlock(&fs_info->fs_devices->device_list_mutex); /* replace the sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, src_device); + btrfs_sysfs_remove_device(src_device); btrfs_sysfs_update_devid(tgt_device); if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &src_device->dev_state)) btrfs_scratch_superblocks(fs_info, src_device->bdev, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index bc341560dc69..7aed0712f183 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -972,6 +972,14 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) } } +static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) +{ + struct btrfs_device *device; + + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_sysfs_remove_device(device); +} + void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) { struct kobject *fsid_kobj = &fs_info->fs_devices->fsid_kobj; @@ -999,7 +1007,7 @@ void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) addrm_unknown_feature_attrs(fs_info, false); sysfs_remove_group(fsid_kobj, &btrfs_feature_attr_group); sysfs_remove_files(fsid_kobj, btrfs_attrs); - btrfs_sysfs_remove_devices_dir(fs_info->fs_devices, NULL); + btrfs_sysfs_remove_fs_devices(fs_info->fs_devices); } static const char * const btrfs_feature_set_names[FEAT_MAX] = { @@ -1186,7 +1194,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, return 0; } -static void btrfs_sysfs_remove_device(struct btrfs_device *device) +void btrfs_sysfs_remove_device(struct btrfs_device *device) { struct hd_struct *disk; struct kobject *disk_kobj; @@ -1212,19 +1220,6 @@ static void btrfs_sysfs_remove_device(struct btrfs_device *device) } } -/* When @device is NULL, remove all devices link */ -void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device) -{ - if (device) { - btrfs_sysfs_remove_device(device); - return; - } - - list_for_each_entry(device, &fs_devices->devices, dev_list) - btrfs_sysfs_remove_device(device); -} - static ssize_t btrfs_devinfo_in_fs_metadata_show(struct kobject *kobj, struct kobj_attribute *a, char *buf) @@ -1459,7 +1454,7 @@ int btrfs_sysfs_add_mounted(struct btrfs_fs_info *fs_info) error = sysfs_create_files(fsid_kobj, btrfs_attrs); if (error) { - btrfs_sysfs_remove_devices_dir(fs_devs, NULL); + btrfs_sysfs_remove_fs_devices(fs_devs); return error; } diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 6296c4f6b330..bacef43f7267 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -15,8 +15,7 @@ enum btrfs_feature_set { char *btrfs_printable_features(enum btrfs_feature_set set, u64 flags); const char *btrfs_feature_set_name(enum btrfs_feature_set set); int btrfs_sysfs_add_device(struct btrfs_device *device); -void btrfs_sysfs_remove_devices_dir(struct btrfs_fs_devices *fs_devices, - struct btrfs_device *device); +void btrfs_sysfs_remove_device(struct btrfs_device *device); int btrfs_sysfs_add_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs); void btrfs_sysfs_update_sprout_fsid(struct btrfs_fs_devices *fs_devices); diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 2aa7fd89efbf..79fe9822196f 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -2041,7 +2041,7 @@ void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info, } int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, - u64 devid) + u64 devid) { struct btrfs_device *device; struct btrfs_fs_devices *cur_devices; @@ -2145,7 +2145,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path, if (device->bdev) { cur_devices->open_devices--; /* remove sysfs entry */ - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); } num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1; @@ -2246,7 +2246,7 @@ void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev) mutex_lock(&fs_devices->device_list_mutex); - btrfs_sysfs_remove_devices_dir(fs_devices, tgtdev); + btrfs_sysfs_remove_device(tgtdev); if (tgtdev->bdev) fs_devices->open_devices--; @@ -2682,7 +2682,7 @@ int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path return ret; error_sysfs: - btrfs_sysfs_remove_devices_dir(fs_devices, device); + btrfs_sysfs_remove_device(device); mutex_lock(&fs_info->fs_devices->device_list_mutex); mutex_lock(&fs_info->chunk_mutex); list_del_rcu(&device->dev_list); From 30b0e4e0e3f53944755257c7d02792ee7f9c072e Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:28 +0800 Subject: [PATCH 203/271] btrfs: initialize sysfs devid and device link for seed device We don't initialize the sysfs devid kobject and device-link yet for the seed devices in an sprouted filesystem. So this patch initializes the seed device devid kobject and the device link in the sysfs. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 7aed0712f183..f66632fefe0a 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -975,9 +975,15 @@ void btrfs_sysfs_remove_fsid(struct btrfs_fs_devices *fs_devs) static void btrfs_sysfs_remove_fs_devices(struct btrfs_fs_devices *fs_devices) { struct btrfs_device *device; + struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) btrfs_sysfs_remove_device(device); + + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) + btrfs_sysfs_remove_device(device); + } } void btrfs_sysfs_remove_mounted(struct btrfs_fs_info *fs_info) @@ -1351,6 +1357,7 @@ static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) { int ret; struct btrfs_device *device; + struct btrfs_fs_devices *seed; list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); @@ -1358,6 +1365,14 @@ static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) return ret; } + list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed->devices, dev_list) { + ret = btrfs_sysfs_add_device(device); + if (ret) + return ret; + } + } + return 0; } From 7ad3912a70a696197d03856749f89dad7a377bc9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:29 +0800 Subject: [PATCH 204/271] btrfs: handle errors in btrfs_sysfs_add_fs_devices btrfs_sysfs_add_fs_devices() is called by btrfs_sysfs_add_mounted(). btrfs_sysfs_add_mounted() assumes that btrfs_sysfs_add_fs_devices() will either add sysfs entries for all the devices or none. So this patch keeps up to its caller expecatation and cleans up the created sysfs entries if it has to fail at some device in the list. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index f66632fefe0a..c71db7acb89d 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1362,18 +1362,22 @@ static int btrfs_sysfs_add_fs_devices(struct btrfs_fs_devices *fs_devices) list_for_each_entry(device, &fs_devices->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) - return ret; + goto fail; } list_for_each_entry(seed, &fs_devices->seed_list, seed_list) { list_for_each_entry(device, &seed->devices, dev_list) { ret = btrfs_sysfs_add_device(device); if (ret) - return ret; + goto fail; } } return 0; + +fail: + btrfs_sysfs_remove_fs_devices(fs_devices); + return ret; } void btrfs_kobject_uevent(struct block_device *bdev, enum kobject_action action) From 2fca0db07608a303dbe9db802ecca678e358dca9 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:30 +0800 Subject: [PATCH 205/271] btrfs: reada: lock all seed/sprout devices in __reada_start_machine On an fs mounted using a sprout device, the seed fs_devices are maintained in a linked list under fs_info->fs_devices. Each seeds fs_devices also has device_list_mutex initialized to protect against the potential race with delete threads. But the delete thread (at btrfs_rm_device()) is holding the fs_info::fs_devices::device_list_mutex mutex which belongs to sprout device_list_mutex instead of seed device_list_mutex. Moreover, there aren't any significient benefits in using the seed::device_list_mutex instead of sprout::device_list_mutex. So this patch converts them of using the seed::device_list_mutex to sprout::device_list_mutex. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/reada.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/btrfs/reada.c b/fs/btrfs/reada.c index e20972230823..9d4f5316a7e8 100644 --- a/fs/btrfs/reada.c +++ b/fs/btrfs/reada.c @@ -776,13 +776,11 @@ static int reada_start_for_fsdevs(struct btrfs_fs_devices *fs_devices) do { enqueued = 0; - mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) { if (atomic_read(&device->reada_in_flight) < MAX_IN_FLIGHT) enqueued += reada_start_machine_dev(device); } - mutex_unlock(&fs_devices->device_list_mutex); total += enqueued; } while (enqueued && total < 10000); @@ -795,10 +793,13 @@ static void __reada_start_machine(struct btrfs_fs_info *fs_info) int i; u64 enqueued = 0; + mutex_lock(&fs_devices->device_list_mutex); + enqueued += reada_start_for_fsdevs(fs_devices); list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) enqueued += reada_start_for_fsdevs(seed_devs); + mutex_unlock(&fs_devices->device_list_mutex); if (enqueued == 0) return; From e17125b52b7ec1075fae408e1a1da8005116e34a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:31 +0800 Subject: [PATCH 206/271] btrfs: use sprout device_list_mutex in btrfs_init_devices_late On a mounted sprout filesystem, all threads now are using the sprout::device_list_mutex, and this is the only code using the seed::device_list_mutex. This patch converts to use the sprouts fs_info->fs_devices->device_list_mutex. The same reasoning holds true here, that device delete is holding the sprout::device_list_mutex. Signed-off-by: Anand Jain Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 79fe9822196f..beacc779dbc2 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7195,16 +7195,14 @@ void btrfs_init_devices_late(struct btrfs_fs_info *fs_info) mutex_lock(&fs_devices->device_list_mutex); list_for_each_entry(device, &fs_devices->devices, dev_list) device->fs_info = fs_info; - mutex_unlock(&fs_devices->device_list_mutex); list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - mutex_lock(&seed_devs->device_list_mutex); list_for_each_entry(device, &seed_devs->devices, dev_list) device->fs_info = fs_info; - mutex_unlock(&seed_devs->device_list_mutex); seed_devs->fs_info = fs_info; } + mutex_unlock(&fs_devices->device_list_mutex); } static u64 btrfs_dev_stats_value(const struct extent_buffer *eb, From 1888709d71806c0bdf008ea471ce9569b8efa79b Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:32 +0800 Subject: [PATCH 207/271] btrfs: remove tmp variable for list traversal in btrfs_init_dev_replace_tgtdev In the function btrfs_init_dev_replace_tgtdev(), the local variable devices is used only once, we can remove it. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index db4d4a53ab73..500e45f7aa3f 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -224,7 +224,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, { struct btrfs_device *device; struct block_device *bdev; - struct list_head *devices; struct rcu_string *name; u64 devid = BTRFS_DEV_REPLACE_DEVID; int ret = 0; @@ -244,8 +243,7 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info, sync_blockdev(bdev); - devices = &fs_info->fs_devices->devices; - list_for_each_entry(device, devices, dev_list) { + list_for_each_entry(device, &fs_info->fs_devices->devices, dev_list) { if (device->bdev == bdev) { btrfs_err(fs_info, "target device is in the filesystem!"); From e493e8f9bcb5f6cf00f94d30eddc43c5be731ff4 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:34 +0800 Subject: [PATCH 208/271] btrfs: remove unnecessary tmp variable in btrfs_assign_next_active_device() We can check the argument value directly, no need for the temporary variable. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index beacc779dbc2..78b67cc6cec1 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1962,16 +1962,13 @@ static struct btrfs_device * btrfs_find_next_active_device( * this_dev) which is active. */ void __cold btrfs_assign_next_active_device(struct btrfs_device *device, - struct btrfs_device *this_dev) + struct btrfs_device *next_device) { struct btrfs_fs_info *fs_info = device->fs_info; - struct btrfs_device *next_device; - if (this_dev) - next_device = this_dev; - else + if (!next_device) next_device = btrfs_find_next_active_device(fs_info->fs_devices, - device); + device); ASSERT(next_device); if (fs_info->sb->s_bdev && From c83b60c0e4d27363e284123afb50ff34f0ea1f1a Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:35 +0800 Subject: [PATCH 209/271] btrfs: simplify gotos in open_seed_device The function does not have a common exit block and returns immediatelly so there's no point having the goto. Remove the two cases. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 78b67cc6cec1..6c7c8819cb31 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -6753,19 +6753,17 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder); if (ret) { free_fs_devices(fs_devices); - fs_devices = ERR_PTR(ret); - goto out; + return ERR_PTR(ret); } if (!fs_devices->seeding) { close_fs_devices(fs_devices); free_fs_devices(fs_devices); - fs_devices = ERR_PTR(-EINVAL); - goto out; + return ERR_PTR(-EINVAL); } list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list); -out: + return fs_devices; } From 0725c0c9351d9854a61bc68f0a59d9f91d75abbd Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Sat, 5 Sep 2020 01:34:36 +0800 Subject: [PATCH 210/271] btrfs: move btrfs_dev_replace_update_device_in_mapping_tree to drop declaration The function is short and simple, we can get rid of the declaration as it's not necessary for a static function. Move it before its first caller. No functional changes. Reviewed-by: Nikolay Borisov Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/dev-replace.c | 56 ++++++++++++++++++++---------------------- 1 file changed, 26 insertions(+), 30 deletions(-) diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 500e45f7aa3f..4a0243cb9d97 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -64,10 +64,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret); -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev); static int btrfs_dev_replace_kthread(void *data); int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) @@ -628,6 +624,32 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, return ret; } +static void btrfs_dev_replace_update_device_in_mapping_tree( + struct btrfs_fs_info *fs_info, + struct btrfs_device *srcdev, + struct btrfs_device *tgtdev) +{ + struct extent_map_tree *em_tree = &fs_info->mapping_tree; + struct extent_map *em; + struct map_lookup *map; + u64 start = 0; + int i; + + write_lock(&em_tree->lock); + do { + em = lookup_extent_mapping(em_tree, start, (u64)-1); + if (!em) + break; + map = em->map_lookup; + for (i = 0; i < map->num_stripes; i++) + if (srcdev == map->stripes[i].dev) + map->stripes[i].dev = tgtdev; + start = em->start + em->len; + free_extent_map(em); + } while (start); + write_unlock(&em_tree->lock); +} + static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, int scrub_ret) { @@ -797,32 +819,6 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info, return 0; } -static void btrfs_dev_replace_update_device_in_mapping_tree( - struct btrfs_fs_info *fs_info, - struct btrfs_device *srcdev, - struct btrfs_device *tgtdev) -{ - struct extent_map_tree *em_tree = &fs_info->mapping_tree; - struct extent_map *em; - struct map_lookup *map; - u64 start = 0; - int i; - - write_lock(&em_tree->lock); - do { - em = lookup_extent_mapping(em_tree, start, (u64)-1); - if (!em) - break; - map = em->map_lookup; - for (i = 0; i < map->num_stripes; i++) - if (srcdev == map->stripes[i].dev) - map->stripes[i].dev = tgtdev; - start = em->start + em->len; - free_extent_map(em); - } while (start); - write_unlock(&em_tree->lock); -} - /* * Read progress of device replace status according to the state and last * stored position. The value format is the same as for From a31a5876fae21c602f14686f1e8985f98b153d2b Mon Sep 17 00:00:00 2001 From: YueHaibing Date: Wed, 9 Sep 2020 21:51:42 +0800 Subject: [PATCH 211/271] btrfs: remove unused function calc_global_rsv_need_space() It is not used since commit 0096420adb03 ("btrfs: do not account global reserve in can_overcommit"). Reviewed-by: Anand Jain Reviewed-by: Johannes Thumshirn Signed-off-by: YueHaibing Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/space-info.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index b733718f45d3..0f16a2ce5401 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -301,11 +301,6 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, return NULL; } -static inline u64 calc_global_rsv_need_space(struct btrfs_block_rsv *global) -{ - return (global->size << 1); -} - static u64 calc_available_free_space(struct btrfs_fs_info *fs_info, struct btrfs_space_info *space_info, enum btrfs_reserve_flush_enum flush) From 8fccebfa534c7984864394fa03608305e4929aae Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:20 +0100 Subject: [PATCH 212/271] btrfs: fix metadata reservation for fallocate that leads to transaction aborts When doing an fallocate(), specially a zero range operation, we assume that reserving 3 units of metadata space is enough, that at most we touch one leaf in subvolume/fs tree for removing existing file extent items and inserting a new file extent item. This assumption is generally true for most common use cases. However when we end up needing to remove file extent items from multiple leaves, we can end up failing with -ENOSPC and abort the current transaction, turning the filesystem to RO mode. When this happens a stack trace like the following is dumped in dmesg/syslog: [ 1500.620934] ------------[ cut here ]------------ [ 1500.620938] BTRFS: Transaction aborted (error -28) [ 1500.620973] WARNING: CPU: 2 PID: 30807 at fs/btrfs/inode.c:9724 __btrfs_prealloc_file_range+0x512/0x570 [btrfs] [ 1500.620974] Modules linked in: btrfs intel_rapl_msr intel_rapl_common kvm_intel (...) [ 1500.621010] CPU: 2 PID: 30807 Comm: xfs_io Tainted: G W 5.9.0-rc3-btrfs-next-67 #1 [ 1500.621012] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 [ 1500.621023] RIP: 0010:__btrfs_prealloc_file_range+0x512/0x570 [btrfs] [ 1500.621026] Code: 8b 40 50 f0 48 (...) [ 1500.621028] RSP: 0018:ffffb05fc8803ca0 EFLAGS: 00010286 [ 1500.621030] RAX: 0000000000000000 RBX: ffff9608af276488 RCX: 0000000000000000 [ 1500.621032] RDX: 0000000000000001 RSI: 0000000000000027 RDI: 00000000ffffffff [ 1500.621033] RBP: ffffb05fc8803d90 R08: 0000000000000001 R09: 0000000000000001 [ 1500.621035] R10: 0000000000000000 R11: 0000000000000000 R12: 0000000003200000 [ 1500.621037] R13: 00000000ffffffe4 R14: ffff9608af275fe8 R15: ffff9608af275f60 [ 1500.621039] FS: 00007fb5b2368ec0(0000) GS:ffff9608b6600000(0000) knlGS:0000000000000000 [ 1500.621041] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [ 1500.621043] CR2: 00007fb5b2366fb8 CR3: 0000000202d38005 CR4: 00000000003706e0 [ 1500.621046] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 [ 1500.621047] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 [ 1500.621049] Call Trace: [ 1500.621076] btrfs_prealloc_file_range+0x10/0x20 [btrfs] [ 1500.621087] btrfs_fallocate+0xccd/0x1280 [btrfs] [ 1500.621108] vfs_fallocate+0x14d/0x290 [ 1500.621112] ksys_fallocate+0x3a/0x70 [ 1500.621117] __x64_sys_fallocate+0x1a/0x20 [ 1500.621120] do_syscall_64+0x33/0x80 [ 1500.621123] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [ 1500.621126] RIP: 0033:0x7fb5b248c477 [ 1500.621128] Code: 89 7c 24 08 (...) [ 1500.621130] RSP: 002b:00007ffc7bee9060 EFLAGS: 00000293 ORIG_RAX: 000000000000011d [ 1500.621132] RAX: ffffffffffffffda RBX: 0000000000000002 RCX: 00007fb5b248c477 [ 1500.621134] RDX: 0000000000000000 RSI: 0000000000000010 RDI: 0000000000000003 [ 1500.621136] RBP: 0000557718faafd0 R08: 0000000000000000 R09: 0000000000000000 [ 1500.621137] R10: 0000000003200000 R11: 0000000000000293 R12: 0000000000000010 [ 1500.621139] R13: 0000557718faafb0 R14: 0000557718faa480 R15: 0000000000000003 [ 1500.621151] irq event stamp: 1026217 [ 1500.621154] hardirqs last enabled at (1026223): [] console_unlock+0x500/0x5c0 [ 1500.621156] hardirqs last disabled at (1026228): [] console_unlock+0x457/0x5c0 [ 1500.621159] softirqs last enabled at (1022486): [] __do_softirq+0x3dc/0x606 [ 1500.621161] softirqs last disabled at (1022477): [] asm_call_on_stack+0x12/0x20 [ 1500.621162] ---[ end trace 2955b08408d8b9d4 ]--- [ 1500.621167] BTRFS: error (device sdj) in __btrfs_prealloc_file_range:9724: errno=-28 No space left When we use fallocate() internally, for reserving an extent for a space cache, inode cache or relocation, we can't hit this problem since either there aren't any file extent items to remove from the subvolume tree or there is at most one. When using plain fallocate() it's very unlikely, since that would require having many file extent items representing holes for the target range and crossing multiple leafs - we attempt to increase the range (merge) of such file extent items when punching holes, so at most we end up with 2 file extent items for holes at leaf boundaries. However when using the zero range operation of fallocate() for a large range (100+ MiB for example) that's fairly easy to trigger. The following example reproducer triggers the issue: $ cat reproducer.sh #!/bin/bash umount /dev/sdj &> /dev/null mkfs.btrfs -f -n 16384 -O ^no-holes /dev/sdj > /dev/null mount /dev/sdj /mnt/sdj # Create a 100M file with many file extent items. Punch a hole every 8K # just to speedup the file creation - we could do 4K sequential writes # followed by fsync (or O_SYNC) as well, but that takes a lot of time. file_size=$((100 * 1024 * 1024)) xfs_io -f -c "pwrite -S 0xab -b 10M 0 $file_size" /mnt/sdj/foobar for ((i = 0; i < $file_size; i += 8192)); do xfs_io -c "fpunch $i 4096" /mnt/sdj/foobar done # Force a transaction commit, so the zero range operation will be forced # to COW all metadata extents it need to touch. sync xfs_io -c "fzero 0 $file_size" /mnt/sdj/foobar umount /mnt/sdj $ ./reproducer.sh wrote 104857600/104857600 bytes at offset 0 100 MiB, 10 ops; 0.0669 sec (1.458 GiB/sec and 149.3117 ops/sec) fallocate: No space left on device $ dmesg To fix this use the existing infrastructure that hole punching and extent cloning use for replacing a file range with another extent. This deals with doing the removal of file extent items and inserting the new one using an incremental approach, reserving more space when needed and always ensuring we don't leave an implicit hole in the range in case we need to do multiple iterations and a crash happens between iterations. A test case for fstests will follow up soon. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 16 +++++++++++ fs/btrfs/file.c | 38 +++++++++++++++++++------- fs/btrfs/inode.c | 68 +++++++++++++++++++++++++++++++--------------- fs/btrfs/reflink.c | 1 + 4 files changed, 91 insertions(+), 32 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 98c5f6178efc..1308a851b00f 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1202,6 +1202,22 @@ struct btrfs_clone_extent_info { u64 file_offset; char *extent_buf; u32 item_size; + /* + * Set to true when attempting to replace a file range with a new extent + * described by this structure, set to false when attempting to clone an + * existing extent into a file range. + */ + bool is_new_extent; + /* Meaningful only if is_new_extent is true. */ + int qgroup_reserved; + /* + * Meaningful only if is_new_extent is true. + * Used to track how many extent items we have already inserted in a + * subvolume tree that refer to the extent described by this structure, + * so that we know when to create a new delayed ref or update an existing + * one. + */ + int insertions; }; struct btrfs_file_private { diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index af4eab9cbc51..73827c9c7a70 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2583,7 +2583,6 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; int slot; struct btrfs_ref ref = { 0 }; - u64 ref_offset; int ret; if (clone_len == 0) @@ -2608,6 +2607,8 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); + if (clone_info->is_new_extent) + btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); @@ -2621,13 +2622,29 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, return 0; inode_add_bytes(inode, clone_len); - btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; - btrfs_init_data_ref(&ref, root->root_key.objectid, - btrfs_ino(BTRFS_I(inode)), ref_offset); - ret = btrfs_inc_extent_ref(trans, &ref); + + if (clone_info->is_new_extent && clone_info->insertions == 0) { + key.objectid = clone_info->disk_offset; + key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = clone_info->disk_len; + ret = btrfs_alloc_reserved_file_extent(trans, root, + btrfs_ino(BTRFS_I(inode)), + clone_info->file_offset, + clone_info->qgroup_reserved, + &key); + } else { + u64 ref_offset; + + btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, + clone_info->disk_offset, + clone_info->disk_len, 0); + ref_offset = clone_info->file_offset - clone_info->data_offset; + btrfs_init_data_ref(&ref, root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), ref_offset); + ret = btrfs_inc_extent_ref(trans, &ref); + } + + clone_info->insertions++; return ret; } @@ -2705,7 +2722,8 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. */ - if (clone_info && ret && ret != -EOPNOTSUPP) + if (clone_info && !clone_info->is_new_extent && + ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; } @@ -2800,7 +2818,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. */ - if (clone_info) + if (clone_info && !clone_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index cce6f8789a4e..53bce9351cfb 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9576,11 +9576,15 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, return err; } -static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, +static struct btrfs_trans_handle *insert_prealloc_file_extent( + struct btrfs_trans_handle *trans_in, struct inode *inode, struct btrfs_key *ins, u64 file_offset) { struct btrfs_file_extent_item stack_fi; + struct btrfs_clone_extent_info extent_info; + struct btrfs_trans_handle *trans = trans_in; + struct btrfs_path *path; u64 start = ins->objectid; u64 len = ins->offset; int ret; @@ -9597,10 +9601,41 @@ static int insert_prealloc_file_extent(struct btrfs_trans_handle *trans, ret = btrfs_qgroup_release_data(BTRFS_I(inode), file_offset, len); if (ret < 0) - return ret; - return insert_reserved_file_extent(trans, BTRFS_I(inode), file_offset, - &stack_fi, ret); + return ERR_PTR(ret); + + if (trans) { + ret = insert_reserved_file_extent(trans, BTRFS_I(inode), + file_offset, &stack_fi, ret); + if (ret) + return ERR_PTR(ret); + return trans; + } + + extent_info.disk_offset = start; + extent_info.disk_len = len; + extent_info.data_offset = 0; + extent_info.data_len = len; + extent_info.file_offset = file_offset; + extent_info.extent_buf = (char *)&stack_fi; + extent_info.item_size = sizeof(stack_fi); + extent_info.is_new_extent = true; + extent_info.qgroup_reserved = ret; + extent_info.insertions = 0; + + path = btrfs_alloc_path(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = btrfs_punch_hole_range(inode, path, file_offset, + file_offset + len - 1, &extent_info, + &trans); + btrfs_free_path(path); + if (ret) + return ERR_PTR(ret); + + return trans; } + static int __btrfs_prealloc_file_range(struct inode *inode, int mode, u64 start, u64 num_bytes, u64 min_size, loff_t actual_len, u64 *alloc_hint, @@ -9623,14 +9658,6 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (trans) own_trans = false; while (num_bytes > 0) { - if (own_trans) { - trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - break; - } - } - cur_bytes = min_t(u64, num_bytes, SZ_256M); cur_bytes = max(cur_bytes, min_size); /* @@ -9642,11 +9669,8 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, cur_bytes = min(cur_bytes, last_alloc); ret = btrfs_reserve_extent(root, cur_bytes, cur_bytes, min_size, 0, *alloc_hint, &ins, 1, 0); - if (ret) { - if (own_trans) - btrfs_end_transaction(trans); + if (ret) break; - } /* * We've reserved this space, and thus converted it from @@ -9659,13 +9683,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, btrfs_dec_block_group_reservations(fs_info, ins.objectid); last_alloc = ins.offset; - ret = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); - if (ret) { + trans = insert_prealloc_file_extent(trans, inode, &ins, cur_offset); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); - btrfs_abort_transaction(trans, ret); - if (own_trans) - btrfs_end_transaction(trans); break; } @@ -9728,8 +9750,10 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, break; } - if (own_trans) + if (own_trans) { btrfs_end_transaction(trans); + trans = NULL; + } } if (clear_offset < end) btrfs_free_reserved_data_space(BTRFS_I(inode), NULL, clear_offset, diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 2461be6ccb6f..3c03126b52b1 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -463,6 +463,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.item_size = size; + clone_info.is_new_extent = false; ret = btrfs_punch_hole_range(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); From fb870f6cdd72a1f143d9d25ab864e474fed65616 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:21 +0100 Subject: [PATCH 213/271] btrfs: remove item_size member of struct btrfs_clone_extent_info The value of item_size of struct btrfs_clone_extent_info is always set to the size of a non-inline file extent item, and in fact the infrastructure that uses this structure (btrfs_punch_hole_range()) does not work with inline file extents at all (and it is not supposed to). So just remove that field from the structure and use directly sizeof(struct btrfs_file_extent_item) instead. Also assert that the file extent type is not inline at btrfs_insert_clone_extent(). Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 5 +++-- fs/btrfs/inode.c | 1 - fs/btrfs/reflink.c | 1 - 4 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1308a851b00f..6200e430dfad 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1200,8 +1200,8 @@ struct btrfs_clone_extent_info { u64 data_offset; u64 data_len; u64 file_offset; + /* Pointer to a file extent item of type regular or prealloc. */ char *extent_buf; - u32 item_size; /* * Set to true when attempting to replace a file range with a new extent * described by this structure, set to false when attempting to clone an diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 73827c9c7a70..28794a98778a 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2596,15 +2596,16 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = clone_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; write_extent_buffer(leaf, clone_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), - clone_info->item_size); + sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); + ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); if (clone_info->is_new_extent) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 53bce9351cfb..a9f4875a4a52 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9617,7 +9617,6 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( extent_info.data_len = len; extent_info.file_offset = file_offset; extent_info.extent_buf = (char *)&stack_fi; - extent_info.item_size = sizeof(stack_fi); extent_info.is_new_extent = true; extent_info.qgroup_reserved = ret; extent_info.insertions = 0; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 3c03126b52b1..020da4d65b64 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -462,7 +462,6 @@ static int btrfs_clone(struct inode *src, struct inode *inode, clone_info.data_len = datal; clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; - clone_info.item_size = size; clone_info.is_new_extent = false; ret = btrfs_punch_hole_range(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, From bf385648fa4805acf206254c77916edb58dbfe25 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:22 +0100 Subject: [PATCH 214/271] btrfs: rename struct btrfs_clone_extent_info to a more generic name Now that we can use btrfs_clone_extent_info to convey information for a new prealloc extent as well, and not just for existing extents that are being cloned, rename it to btrfs_replace_extent_info, which reflects the fact that this is now more generic and it is used to replace all existing extents in a file range with the extent described by the structure. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 8 +++- fs/btrfs/file.c | 92 +++++++++++++++++++++++----------------------- fs/btrfs/inode.c | 2 +- fs/btrfs/reflink.c | 2 +- 4 files changed, 54 insertions(+), 50 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 6200e430dfad..0e4034d63111 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1194,7 +1194,11 @@ struct btrfs_root { #endif }; -struct btrfs_clone_extent_info { +/* + * Structure that conveys information about an extent that is going to replace + * all the extents in a file range. + */ +struct btrfs_replace_extent_info { u64 disk_offset; u64 disk_len; u64 data_offset; @@ -3086,7 +3090,7 @@ int btrfs_drop_extents(struct btrfs_trans_handle *trans, u64 end, int drop_cache); int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 start, u64 end); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 28794a98778a..7ac0a20119f3 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2573,8 +2573,8 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, - struct btrfs_clone_extent_info *clone_info, - const u64 clone_len) + struct btrfs_replace_extent_info *extent_info, + const u64 replace_len) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_root *root = BTRFS_I(inode)->root; @@ -2585,67 +2585,67 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, struct btrfs_ref ref = { 0 }; int ret; - if (clone_len == 0) + if (replace_len == 0) return 0; - if (clone_info->disk_offset == 0 && + if (extent_info->disk_offset == 0 && btrfs_fs_incompat(fs_info, NO_HOLES)) return 0; key.objectid = btrfs_ino(BTRFS_I(inode)); key.type = BTRFS_EXTENT_DATA_KEY; - key.offset = clone_info->file_offset; + key.offset = extent_info->file_offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_file_extent_item)); if (ret) return ret; leaf = path->nodes[0]; slot = path->slots[0]; - write_extent_buffer(leaf, clone_info->extent_buf, + write_extent_buffer(leaf, extent_info->extent_buf, btrfs_item_ptr_offset(leaf, slot), sizeof(struct btrfs_file_extent_item)); extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE); - btrfs_set_file_extent_offset(leaf, extent, clone_info->data_offset); - btrfs_set_file_extent_num_bytes(leaf, extent, clone_len); - if (clone_info->is_new_extent) + btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset); + btrfs_set_file_extent_num_bytes(leaf, extent, replace_len); + if (extent_info->is_new_extent) btrfs_set_file_extent_generation(leaf, extent, trans->transid); btrfs_mark_buffer_dirty(leaf); btrfs_release_path(path); ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), - clone_info->file_offset, clone_len); + extent_info->file_offset, replace_len); if (ret) return ret; /* If it's a hole, nothing more needs to be done. */ - if (clone_info->disk_offset == 0) + if (extent_info->disk_offset == 0) return 0; - inode_add_bytes(inode, clone_len); + inode_add_bytes(inode, replace_len); - if (clone_info->is_new_extent && clone_info->insertions == 0) { - key.objectid = clone_info->disk_offset; + if (extent_info->is_new_extent && extent_info->insertions == 0) { + key.objectid = extent_info->disk_offset; key.type = BTRFS_EXTENT_ITEM_KEY; - key.offset = clone_info->disk_len; + key.offset = extent_info->disk_len; ret = btrfs_alloc_reserved_file_extent(trans, root, btrfs_ino(BTRFS_I(inode)), - clone_info->file_offset, - clone_info->qgroup_reserved, + extent_info->file_offset, + extent_info->qgroup_reserved, &key); } else { u64 ref_offset; btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, - clone_info->disk_offset, - clone_info->disk_len, 0); - ref_offset = clone_info->file_offset - clone_info->data_offset; + extent_info->disk_offset, + extent_info->disk_len, 0); + ref_offset = extent_info->file_offset - extent_info->data_offset; btrfs_init_data_ref(&ref, root->root_key.objectid, btrfs_ino(BTRFS_I(inode)), ref_offset); ret = btrfs_inc_extent_ref(trans, &ref); } - clone_info->insertions++; + extent_info->insertions++; return ret; } @@ -2653,15 +2653,15 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, /* * The respective range must have been previously locked, as well as the inode. * The end offset is inclusive (last byte of the range). - * @clone_info is NULL for fallocate's hole punching and non-NULL for extent - * cloning. - * When cloning, we don't want to end up in a state where we dropped extents - * without inserting a new one, so we must abort the transaction to avoid a - * corruption. + * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing + * the file range with an extent. + * When not punching a hole, we don't want to end up in a state where we dropped + * extents without inserting a new one, so we must abort the transaction to avoid + * a corruption. */ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, - struct btrfs_clone_extent_info *clone_info, + struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -2690,10 +2690,10 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, /* * 1 - update the inode * 1 - removing the extents in the range - * 1 - adding the hole extent if no_holes isn't set or if we are cloning - * an extent + * 1 - adding the hole extent if no_holes isn't set or if we are + * replacing the range with a new extent */ - if (!btrfs_fs_incompat(fs_info, NO_HOLES) || clone_info) + if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info) rsv_count = 3; else rsv_count = 2; @@ -2723,7 +2723,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * returned by __btrfs_drop_extents() without having * changed anything in the file. */ - if (clone_info && !clone_info->is_new_extent && + if (extent_info && !extent_info->is_new_extent && ret && ret != -EOPNOTSUPP) btrfs_abort_transaction(trans, ret); break; @@ -2731,7 +2731,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, trans->block_rsv = &fs_info->trans_block_rsv; - if (!clone_info && cur_offset < drop_end && + if (!extent_info && cur_offset < drop_end && cur_offset < ino_size) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); @@ -2745,7 +2745,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); break; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* * We are past the i_size here, but since we didn't * insert holes we need to clear the mapped area so we @@ -2765,18 +2765,18 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info && drop_end > clone_info->file_offset) { - u64 clone_len = drop_end - clone_info->file_offset; + if (extent_info && drop_end > extent_info->file_offset) { + u64 replace_len = drop_end - extent_info->file_offset; ret = btrfs_insert_clone_extent(trans, inode, path, - clone_info, clone_len); + extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); break; } - clone_info->data_len -= clone_len; - clone_info->data_offset += clone_len; - clone_info->file_offset += clone_len; + extent_info->data_len -= replace_len; + extent_info->data_offset += replace_len; + extent_info->file_offset += replace_len; } cur_offset = drop_end; @@ -2800,7 +2800,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!clone_info) { + if (!extent_info) { ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; @@ -2819,7 +2819,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * than 16Mb would force the full fsync any way (when * try_release_extent_mapping() is invoked during page cache truncation. */ - if (clone_info && !clone_info->is_new_extent) + if (extent_info && !extent_info->is_new_extent) set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -2845,7 +2845,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, * (because it's useless) or if it represents a 0 bytes range (when * cur_offset == drop_end). */ - if (!clone_info && cur_offset < ino_size && cur_offset < drop_end) { + if (!extent_info && cur_offset < ino_size && cur_offset < drop_end) { ret = fill_holes(trans, BTRFS_I(inode), path, cur_offset, drop_end); if (ret) { @@ -2853,7 +2853,7 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, btrfs_abort_transaction(trans, ret); goto out_trans; } - } else if (!clone_info && cur_offset < drop_end) { + } else if (!extent_info && cur_offset < drop_end) { /* See the comment in the loop above for the reasoning here. */ ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), cur_offset, drop_end - cur_offset); @@ -2863,9 +2863,9 @@ int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, } } - if (clone_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, clone_info, - clone_info->data_len); + if (extent_info) { + ret = btrfs_insert_clone_extent(trans, inode, path, extent_info, + extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9f4875a4a52..77ab9eada11d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9582,7 +9582,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( u64 file_offset) { struct btrfs_file_extent_item stack_fi; - struct btrfs_clone_extent_info extent_info; + struct btrfs_replace_extent_info extent_info; struct btrfs_trans_handle *trans = trans_in; struct btrfs_path *path; u64 start = ins->objectid; diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 020da4d65b64..dc8b5397e198 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -439,7 +439,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, if (type == BTRFS_FILE_EXTENT_REG || type == BTRFS_FILE_EXTENT_PREALLOC) { - struct btrfs_clone_extent_info clone_info; + struct btrfs_replace_extent_info clone_info; /* * a | --- range to clone ---| b From 306bfec02b1054d24bbbeec49ece6db396d451e0 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:23 +0100 Subject: [PATCH 215/271] btrfs: rename btrfs_punch_hole_range() to a more generic name The function btrfs_punch_hole_range() is now used to replace all the file extents in a given file range with an extent described in the given struct btrfs_replace_extent_info argument. This extent can either be an existing extent that is being cloned or it can be a new extent (namely a prealloc extent). When that argument is NULL it only punches a hole (drops all the existing extents) in the file range. So rename the function to btrfs_replace_file_extents(). Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 +- fs/btrfs/file.c | 4 ++-- fs/btrfs/inode.c | 2 +- fs/btrfs/reflink.c | 4 ++-- 4 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0e4034d63111..5ab55f678465 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3088,7 +3088,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct inode *inode, u64 start, u64 end, int drop_cache); -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 7ac0a20119f3..241b34e44a6c 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2659,7 +2659,7 @@ static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, * extents without inserting a new one, so we must abort the transaction to avoid * a corruption. */ -int btrfs_punch_hole_range(struct inode *inode, struct btrfs_path *path, +int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, const u64 start, const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out) @@ -3007,7 +3007,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_punch_hole_range(inode, path, lockstart, lockend, NULL, + ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, &trans); btrfs_free_path(path); if (ret) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 77ab9eada11d..ff9b22b71171 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -9625,7 +9625,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( if (!path) return ERR_PTR(-ENOMEM); - ret = btrfs_punch_hole_range(inode, path, file_offset, + ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index dc8b5397e198..39b3269e5760 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -463,7 +463,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; - ret = btrfs_punch_hole_range(inode, path, drop_start, + ret = btrfs_replace_file_extents(inode, path, drop_start, new_key.offset + datal - 1, &clone_info, &trans); if (ret) @@ -533,7 +533,7 @@ static int btrfs_clone(struct inode *src, struct inode *inode, btrfs_release_path(path); path->leave_spinning = 0; - ret = btrfs_punch_hole_range(inode, path, last_dest_end, + ret = btrfs_replace_file_extents(inode, path, last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; From 0cbb5bdfea2675f4b9749f99120a79f9e0a28600 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Tue, 8 Sep 2020 11:27:24 +0100 Subject: [PATCH 216/271] btrfs: rename btrfs_insert_clone_extent() to a more generic name Now that we use the same mechanism to replace all the extents in a file range with either a hole, an existing extent (when cloning) or a new extent (when using fallocate), the name of btrfs_insert_clone_extent() no longer reflects its genericity. So rename it to btrfs_insert_replace_extent(), since what it does is to either insert an existing extent or a new extent into a file range. Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 241b34e44a6c..32ceda264b7d 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2570,7 +2570,7 @@ static int btrfs_punch_hole_lock_range(struct inode *inode, return 0; } -static int btrfs_insert_clone_extent(struct btrfs_trans_handle *trans, +static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_path *path, struct btrfs_replace_extent_info *extent_info, @@ -2768,7 +2768,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, if (extent_info && drop_end > extent_info->file_offset) { u64 replace_len = drop_end - extent_info->file_offset; - ret = btrfs_insert_clone_extent(trans, inode, path, + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, replace_len); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2864,7 +2864,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { - ret = btrfs_insert_clone_extent(trans, inode, path, extent_info, + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len); if (ret) { btrfs_abort_transaction(trans, ret); From a9b2e0de92cbed84b9bca1464babaf1d67a5542b Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:51 -0700 Subject: [PATCH 217/271] btrfs: send: get rid of i_size logic in send_write() send_write()/fill_read_buf() have some logic for avoiding reading past i_size. However, everywhere that we call send_write()/send_extent_data(), we've already clamped the length down to i_size. Get rid of the i_size handling, which simplifies the next change. Reviewed-by: Filipe Manana Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 37 ++++++++++--------------------------- 1 file changed, 10 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 7c7c09fc65e8..8af5e867e4ca 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -4794,7 +4794,7 @@ static int process_all_new_xattrs(struct send_ctx *sctx) return ret; } -static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4804,21 +4804,13 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - ssize_t ret = 0; + int ret = 0; + size_t read = 0; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) return PTR_ERR(inode); - if (offset + len > i_size_read(inode)) { - if (offset > i_size_read(inode)) - len = 0; - else - len = offset - i_size_read(inode); - } - if (len == 0) - goto out; - last_index = (offset + len - 1) >> PAGE_SHIFT; /* initial readahead */ @@ -4859,16 +4851,15 @@ static ssize_t fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + ret, addr + pg_offset, cur_len); + memcpy(sctx->read_buf + read, addr + pg_offset, cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - ret += cur_len; + read += cur_len; } -out: iput(inode); return ret; } @@ -4882,7 +4873,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - ssize_t num_read = 0; p = fs_path_alloc(); if (!p) @@ -4890,12 +4880,9 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - num_read = fill_read_buf(sctx, offset, len); - if (num_read <= 0) { - if (num_read < 0) - ret = num_read; + ret = fill_read_buf(sctx, offset, len); + if (ret < 0) goto out; - } ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) @@ -4907,16 +4894,14 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, num_read); + TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); ret = send_cmd(sctx); tlv_put_failure: out: fs_path_free(p); - if (ret < 0) - return ret; - return num_read; + return ret; } /* @@ -5095,9 +5080,7 @@ static int send_extent_data(struct send_ctx *sctx, ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; - if (!ret) - break; - sent += ret; + sent += size; } return 0; } From 8c7d9fe06f5bb87dce356dcc9ea7c003b529e6ba Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:52 -0700 Subject: [PATCH 218/271] btrfs: send: avoid copying file data send_write() currently copies from the page cache to sctx->read_buf, and then from sctx->read_buf to sctx->send_buf. Similarly, send_hole() zeroes sctx->read_buf and then copies from sctx->read_buf to sctx->send_buf. However, if we write the TLV header manually, we can copy to sctx->send_buf directly and get rid of sctx->read_buf. Reviewed-by: Filipe Manana Reviewed-by: Josef Bacik Signed-off-by: Omar Sandoval Signed-off-by: David Sterba --- fs/btrfs/send.c | 65 +++++++++++++++++++++++++++++-------------------- fs/btrfs/send.h | 1 - 2 files changed, 39 insertions(+), 27 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8af5e867e4ca..491c486e3960 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -122,8 +122,6 @@ struct send_ctx { struct file_ra_state ra; - char *read_buf; - /* * We process inodes by their increasing order, so if before an * incremental send we reverse the parent/child relationship of @@ -4794,7 +4792,25 @@ static int process_all_new_xattrs(struct send_ctx *sctx) return ret; } -static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) +static inline u64 max_send_read_size(const struct send_ctx *sctx) +{ + return sctx->send_max_size - SZ_16K; +} + +static int put_data_header(struct send_ctx *sctx, u32 len) +{ + struct btrfs_tlv_header *hdr; + + if (sctx->send_max_size - sctx->send_size < sizeof(*hdr) + len) + return -EOVERFLOW; + hdr = (struct btrfs_tlv_header *)(sctx->send_buf + sctx->send_size); + put_unaligned_le16(BTRFS_SEND_A_DATA, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); + sctx->send_size += sizeof(*hdr); + return 0; +} + +static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; @@ -4804,8 +4820,11 @@ static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) pgoff_t index = offset >> PAGE_SHIFT; pgoff_t last_index; unsigned pg_offset = offset_in_page(offset); - int ret = 0; - size_t read = 0; + int ret; + + ret = put_data_header(sctx, len); + if (ret) + return ret; inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root); if (IS_ERR(inode)) @@ -4851,14 +4870,15 @@ static int fill_read_buf(struct send_ctx *sctx, u64 offset, u32 len) } addr = kmap(page); - memcpy(sctx->read_buf + read, addr + pg_offset, cur_len); + memcpy(sctx->send_buf + sctx->send_size, addr + pg_offset, + cur_len); kunmap(page); unlock_page(page); put_page(page); index++; pg_offset = 0; len -= cur_len; - read += cur_len; + sctx->send_size += cur_len; } iput(inode); return ret; @@ -4880,10 +4900,6 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); - ret = fill_read_buf(sctx, offset, len); - if (ret < 0) - goto out; - ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) goto out; @@ -4894,7 +4910,9 @@ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_file_data(sctx, offset, len); + if (ret < 0) + goto out; ret = send_cmd(sctx); @@ -5013,8 +5031,8 @@ static int send_update_extent(struct send_ctx *sctx, static int send_hole(struct send_ctx *sctx, u64 end) { struct fs_path *p = NULL; + u64 read_size = max_send_read_size(sctx); u64 offset = sctx->cur_inode_last_extent; - u64 len; int ret = 0; /* @@ -5041,16 +5059,19 @@ static int send_hole(struct send_ctx *sctx, u64 end) ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); if (ret < 0) goto tlv_put_failure; - memset(sctx->read_buf, 0, BTRFS_SEND_READ_SIZE); while (offset < end) { - len = min_t(u64, end - offset, BTRFS_SEND_READ_SIZE); + u64 len = min(end - offset, read_size); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) break; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); - TLV_PUT(sctx, BTRFS_SEND_A_DATA, sctx->read_buf, len); + ret = put_data_header(sctx, len); + if (ret < 0) + break; + memset(sctx->send_buf + sctx->send_size, 0, len); + sctx->send_size += len; ret = send_cmd(sctx); if (ret < 0) break; @@ -5066,17 +5087,16 @@ static int send_extent_data(struct send_ctx *sctx, const u64 offset, const u64 len) { + u64 read_size = max_send_read_size(sctx); u64 sent = 0; if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, len); while (sent < len) { - u64 size = len - sent; + u64 size = min(len - sent, read_size); int ret; - if (size > BTRFS_SEND_READ_SIZE) - size = BTRFS_SEND_READ_SIZE; ret = send_write(sctx, offset + sent, size); if (ret < 0) return ret; @@ -7145,12 +7165,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) goto out; } - sctx->read_buf = kvmalloc(BTRFS_SEND_READ_SIZE, GFP_KERNEL); - if (!sctx->read_buf) { - ret = -ENOMEM; - goto out; - } - sctx->pending_dir_moves = RB_ROOT; sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; @@ -7354,7 +7368,6 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) kvfree(sctx->clone_roots); kvfree(sctx->send_buf); - kvfree(sctx->read_buf); name_cache_free(sctx); diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index ead397f7034f..de91488b7cd0 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -13,7 +13,6 @@ #define BTRFS_SEND_STREAM_VERSION 1 #define BTRFS_SEND_BUF_SIZE SZ_64K -#define BTRFS_SEND_READ_SIZE (48 * SZ_1K) enum btrfs_tlv_type { BTRFS_TLV_U8, From c9a949af13d6d8983de086d9ecbf34c4c94da73e Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:53 -0700 Subject: [PATCH 219/271] btrfs: send: use btrfs_file_extent_end() in send_write_or_clone() send_write_or_clone() basically has an open-coded copy of btrfs_file_extent_end() except that it (incorrectly) aligns to PAGE_SIZE instead of sectorsize. Fix and simplify the code by using btrfs_file_extent_end(). Reviewed-by: Filipe Manana Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 44 +++++++++++--------------------------------- 1 file changed, 11 insertions(+), 33 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 491c486e3960..ac0e942e8d7c 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -5400,51 +5400,29 @@ static int send_write_or_clone(struct send_ctx *sctx, struct clone_root *clone_root) { int ret = 0; - struct btrfs_file_extent_item *ei; u64 offset = key->offset; - u64 len; - u8 type; + u64 end; u64 bs = sctx->send_root->fs_info->sb->s_blocksize; - ei = btrfs_item_ptr(path->nodes[0], path->slots[0], - struct btrfs_file_extent_item); - type = btrfs_file_extent_type(path->nodes[0], ei); - if (type == BTRFS_FILE_EXTENT_INLINE) { - len = btrfs_file_extent_ram_bytes(path->nodes[0], ei); - /* - * it is possible the inline item won't cover the whole page, - * but there may be items after this page. Make - * sure to send the whole thing - */ - len = PAGE_ALIGN(len); - } else { - len = btrfs_file_extent_num_bytes(path->nodes[0], ei); - } + end = min_t(u64, btrfs_file_extent_end(path), sctx->cur_inode_size); + if (offset >= end) + return 0; - if (offset >= sctx->cur_inode_size) { - ret = 0; - goto out; - } - if (offset + len > sctx->cur_inode_size) - len = sctx->cur_inode_size - offset; - if (len == 0) { - ret = 0; - goto out; - } - - if (clone_root && IS_ALIGNED(offset + len, bs)) { + if (clone_root && IS_ALIGNED(end, bs)) { + struct btrfs_file_extent_item *ei; u64 disk_byte; u64 data_offset; + ei = btrfs_item_ptr(path->nodes[0], path->slots[0], + struct btrfs_file_extent_item); disk_byte = btrfs_file_extent_disk_bytenr(path->nodes[0], ei); data_offset = btrfs_file_extent_offset(path->nodes[0], ei); ret = clone_range(sctx, clone_root, disk_byte, data_offset, - offset, len); + offset, end - offset); } else { - ret = send_extent_data(sctx, offset, len); + ret = send_extent_data(sctx, offset, end - offset); } - sctx->cur_inode_next_write_offset = offset + len; -out: + sctx->cur_inode_next_write_offset = end; return ret; } From 7573df5547c00f067e4c0b2fd7cc99f3d3a2bda7 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Fri, 21 Aug 2020 00:39:54 -0700 Subject: [PATCH 220/271] btrfs: sysfs: export supported send stream version This reports the latest send stream version supported by the kernel as the feature in /sys/fs/btrfs/features/send_stream_version . Signed-off-by: Omar Sandoval Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/sysfs.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index c71db7acb89d..13354de56201 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -14,6 +14,7 @@ #include "ctree.h" #include "discard.h" #include "disk-io.h" +#include "send.h" #include "transaction.h" #include "sysfs.h" #include "volumes.h" @@ -321,9 +322,17 @@ static ssize_t supported_checksums_show(struct kobject *kobj, } BTRFS_ATTR(static_feature, supported_checksums, supported_checksums_show); +static ssize_t send_stream_version_show(struct kobject *kobj, + struct kobj_attribute *ka, char *buf) +{ + return snprintf(buf, PAGE_SIZE, "%d\n", BTRFS_SEND_STREAM_VERSION); +} +BTRFS_ATTR(static_feature, send_stream_version, send_stream_version_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), + BTRFS_ATTR_PTR(static_feature, send_stream_version), NULL }; From fc0716c2f6afed06a9306db3fb62cc1fcf389757 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:39:57 +0300 Subject: [PATCH 221/271] btrfs: re-arrange statements in setup_items_for_insert Rearrange statements calculating the offset of the newly added items so that the calculation has to be done only once. No functional change. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 7dbfa365eb18..36eee3aef6ff 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4832,8 +4832,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, btrfs_cpu_key_to_disk(&disk_key, cpu_key + i); btrfs_set_item_key(leaf, &disk_key, slot + i); item = btrfs_item_nr(slot + i); - btrfs_set_token_item_offset(&token, item, data_end - data_size[i]); data_end -= data_size[i]; + btrfs_set_token_item_offset(&token, item, data_end); btrfs_set_token_item_size(&token, item, data_size[i]); } From 3dc9dc8969dc738285cb7d43e852c98459c8f0a9 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:39:58 +0300 Subject: [PATCH 222/271] btrfs: eliminate total_size parameter from setup_items_for_insert The value of this argument can be derived from the total_data as it's simply the value of the data size + size of btrfs_items being touched. Move the parameter calculation inside the function. This results in a simpler interface and also a minor size reduction: ./scripts/bloat-o-meter ctree.original fs/btrfs/ctree.o add/remove: 0/0 grow/shrink: 0/3 up/down: 0/-34 (-34) Function old new delta btrfs_duplicate_item 260 259 -1 setup_items_for_insert 1200 1190 -10 btrfs_insert_empty_items 177 154 -23 Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 10 ++++------ fs/btrfs/ctree.h | 2 +- fs/btrfs/delayed-inode.c | 4 ++-- fs/btrfs/file.c | 5 +---- fs/btrfs/tests/extent-buffer-tests.c | 3 +-- fs/btrfs/tests/inode-tests.c | 6 ++---- 6 files changed, 11 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 36eee3aef6ff..a855ce0e4cb1 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4580,9 +4580,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, - item_size, item_size + - sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, new_key, &item_size, item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4762,7 +4760,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr) + u32 total_data, int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4773,6 +4771,7 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; + const u32 total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4875,8 +4874,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, - total_data, total_size, nr); + setup_items_for_insert(root, path, cpu_key, data_size, total_data, nr); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5ab55f678465..c4f9c1985ad1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2744,7 +2744,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, u32 total_size, int nr); + u32 total_data, int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0727b10a9a89..0360e07d1b0f 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -768,8 +768,8 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, - total_data_size, total_size, nitems); + setup_items_for_insert(root, path, keys, data_size, total_data_size, + nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 32ceda264b7d..fefe152884fa 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,10 +1057,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, - &extent_item_size, - extent_item_size, - sizeof(struct btrfs_item) + + setup_items_for_insert(root, path, &key, &extent_item_size, extent_item_size, 1); *key_inserted = 1; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a1b9f9b5978e..a792a1dfd6dd 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,8 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, path, &key, &value_len, value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 894a63a92236..068b6ef02cec 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,8 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -64,8 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, - value_len + sizeof(struct btrfs_item), 1); + setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); } /* From fc0d82e103c78faa98cf61e681b45618e879b63d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:39:59 +0300 Subject: [PATCH 223/271] btrfs: sink total_data parameter in setup_items_for_insert That parameter can easily be derived based on the "data_size" and "nr" parameters exploit this fact to simply the function's signature. No functional changes. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 13 +++++++++---- fs/btrfs/ctree.h | 2 +- fs/btrfs/delayed-inode.c | 3 +-- fs/btrfs/file.c | 3 +-- fs/btrfs/tests/extent-buffer-tests.c | 2 +- fs/btrfs/tests/inode-tests.c | 4 ++-- 6 files changed, 15 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a855ce0e4cb1..a0a4a885664e 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4580,7 +4580,7 @@ int btrfs_duplicate_item(struct btrfs_trans_handle *trans, return ret; path->slots[0]++; - setup_items_for_insert(root, path, new_key, &item_size, item_size, 1); + setup_items_for_insert(root, path, new_key, &item_size, 1); leaf = path->nodes[0]; memcpy_extent_buffer(leaf, btrfs_item_ptr_offset(leaf, path->slots[0]), @@ -4760,7 +4760,7 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, int nr) + int nr) { struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_item *item; @@ -4771,7 +4771,12 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, struct extent_buffer *leaf; int slot; struct btrfs_map_token token; - const u32 total_size = total_data + (nr * sizeof(struct btrfs_item)); + u32 total_size; + u32 total_data = 0; + + for (i = 0; i < nr; i++) + total_data += data_size[i]; + total_size = total_data + (nr * sizeof(struct btrfs_item)); if (path->slots[0] == 0) { btrfs_cpu_key_to_disk(&disk_key, cpu_key); @@ -4874,7 +4879,7 @@ int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, slot = path->slots[0]; BUG_ON(slot < 0); - setup_items_for_insert(root, path, cpu_key, data_size, total_data, nr); + setup_items_for_insert(root, path, cpu_key, data_size, nr); return 0; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index c4f9c1985ad1..cd644c755142 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2744,7 +2744,7 @@ static inline int btrfs_del_item(struct btrfs_trans_handle *trans, void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, - u32 total_data, int nr); + int nr); int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, const struct btrfs_key *key, void *data, u32 data_size); int btrfs_insert_empty_items(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0360e07d1b0f..5aba81e16113 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -768,8 +768,7 @@ static int btrfs_batch_insert_items(struct btrfs_root *root, } /* insert the keys of the items */ - setup_items_for_insert(root, path, keys, data_size, total_data_size, - nitems); + setup_items_for_insert(root, path, keys, data_size, nitems); /* insert the dir index items */ slot = path->slots[0]; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index fefe152884fa..2969d715c588 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1057,8 +1057,7 @@ int __btrfs_drop_extents(struct btrfs_trans_handle *trans, if (btrfs_comp_cpu_keys(&key, &slot_key) > 0) path->slots[0]++; } - setup_items_for_insert(root, path, &key, &extent_item_size, - extent_item_size, 1); + setup_items_for_insert(root, path, &key, &extent_item_size, 1); *key_inserted = 1; } diff --git a/fs/btrfs/tests/extent-buffer-tests.c b/fs/btrfs/tests/extent-buffer-tests.c index a792a1dfd6dd..df54cdfdc250 100644 --- a/fs/btrfs/tests/extent-buffer-tests.c +++ b/fs/btrfs/tests/extent-buffer-tests.c @@ -60,7 +60,7 @@ static int test_btrfs_split_item(u32 sectorsize, u32 nodesize) key.type = BTRFS_EXTENT_CSUM_KEY; key.offset = 0; - setup_items_for_insert(root, path, &key, &value_len, value_len, 1); + setup_items_for_insert(root, path, &key, &value_len, 1); item = btrfs_item_nr(0); write_extent_buffer(eb, value, btrfs_item_ptr_offset(eb, 0), value_len); diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 068b6ef02cec..cc54d4973a74 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -33,7 +33,7 @@ static void insert_extent(struct btrfs_root *root, u64 start, u64 len, key.type = BTRFS_EXTENT_DATA_KEY; key.offset = start; - setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); btrfs_set_file_extent_generation(leaf, fi, 1); btrfs_set_file_extent_type(leaf, fi, type); @@ -63,7 +63,7 @@ static void insert_inode_item_key(struct btrfs_root *root) key.type = BTRFS_INODE_ITEM_KEY; key.offset = 0; - setup_items_for_insert(root, &path, &key, &value_len, value_len, 1); + setup_items_for_insert(root, &path, &key, &value_len, 1); } /* From da9ffb242c448af20f70b431d3f04df50c6cb0d2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:40:00 +0300 Subject: [PATCH 224/271] btrfs: add kerneldoc for setup_items_for_insert Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a0a4a885664e..4fb2b0673950 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4753,10 +4753,16 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size) } } -/* - * this is a helper for btrfs_insert_empty_items, the main goal here is - * to save stack depth by doing the bulk of the work in a function - * that doesn't call btrfs_search_slot +/** + * setup_items_for_insert - Helper called before inserting one or more items + * to a leaf. Main purpose is to save stack depth by doing the bulk of the work + * in a function that doesn't call btrfs_search_slot + * + * @root: root we are inserting items to + * @path: points to the leaf/slot where we are going to insert new items + * @cpu_key: array of keys for items to be inserted + * @data_size: size of the body of each item we are going to insert + * @nr: size of @cpu_key/@data_size arrays */ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, const struct btrfs_key *cpu_key, u32 *data_size, From 7269ddd2f60286af1f5d1d47d283d2f9a26818c6 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Tue, 1 Sep 2020 17:40:01 +0300 Subject: [PATCH 225/271] btrfs: improve error message in setup_items_for_insert Reword and update formats to match variable types. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba [ update formats ] Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 4fb2b0673950..a165093739c4 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4809,7 +4809,8 @@ void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path, if (old_data < data_end) { btrfs_print_leaf(leaf); - btrfs_crit(fs_info, "slot %d old_data %d data_end %d", + btrfs_crit(fs_info, + "item at slot %d with data offset %u beyond data end of leaf %u", slot, old_data, data_end); BUG(); } From 72804905001267a93e7a470430c2bcc122932ca6 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Sep 2020 17:40:37 -0400 Subject: [PATCH 226/271] btrfs: kill the RCU protection for fs_info->space_info We have this thing wrapped in an RCU lock, but it's really not needed. We create all the space_info's on mount, and we destroy them on unmount. The list never changes and we're protected from messing with it by the normal mount/umount path, so kill the RCU stuff around it. Reviewed-by: Nikolay Borisov Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 16 ++-------------- fs/btrfs/ioctl.c | 10 ++-------- fs/btrfs/space-info.c | 14 ++++---------- fs/btrfs/super.c | 5 +---- 4 files changed, 9 insertions(+), 36 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 01e8ba1da1d3..a3b27204371c 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -2031,8 +2031,7 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) btrfs_release_path(path); } - rcu_read_lock(); - list_for_each_entry_rcu(space_info, &info->space_info, list) { + list_for_each_entry(space_info, &info->space_info, list) { if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2052,7 +2051,6 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) list) inc_block_group_ro(cache, 1); } - rcu_read_unlock(); btrfs_init_global_block_rsv(info); ret = check_chunk_block_group_mappings(info); @@ -3007,12 +3005,10 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { + list_for_each_entry(found, head, list) { if (found->flags & BTRFS_BLOCK_GROUP_METADATA) found->force_alloc = CHUNK_ALLOC_FORCE; } - rcu_read_unlock(); } static int should_alloc_chunk(struct btrfs_fs_info *fs_info, @@ -3343,14 +3339,6 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->block_group_cache_lock); - /* - * Now that all the block groups are freed, go through and free all the - * space_info structs. This is only called during the final stages of - * unmount, and so we know nobody is using them. We call - * synchronize_rcu() once before we start, just to be on the safe side. - */ - synchronize_rcu(); - btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 8091324341a5..de65cf6105e4 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -3475,15 +3475,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, struct btrfs_space_info *tmp; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; @@ -3531,15 +3528,12 @@ static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info, break; info = NULL; - rcu_read_lock(); - list_for_each_entry_rcu(tmp, &fs_info->space_info, - list) { + list_for_each_entry(tmp, &fs_info->space_info, list) { if (tmp->flags == types[i]) { info = tmp; break; } } - rcu_read_unlock(); if (!info) continue; diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 0f16a2ce5401..64099565ab8f 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -175,10 +175,8 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info) struct list_head *head = &info->space_info; struct btrfs_space_info *found; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) + list_for_each_entry(found, head, list) found->full = 0; - rcu_read_unlock(); } static int create_space_info(struct btrfs_fs_info *info, u64 flags) @@ -213,7 +211,7 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) if (ret) return ret; - list_add_rcu(&space_info->list, &info->space_info); + list_add(&space_info->list, &info->space_info); if (flags & BTRFS_BLOCK_GROUP_DATA) info->data_sinfo = space_info; @@ -290,14 +288,10 @@ struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, flags &= BTRFS_BLOCK_GROUP_TYPE_MASK; - rcu_read_lock(); - list_for_each_entry_rcu(found, head, list) { - if (found->flags & flags) { - rcu_read_unlock(); + list_for_each_entry(found, head, list) { + if (found->flags & flags) return found; - } } - rcu_read_unlock(); return NULL; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 3ebe7240c63d..8840a4fa81eb 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -2164,8 +2164,7 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) u64 thresh = 0; int mixed = 0; - rcu_read_lock(); - list_for_each_entry_rcu(found, &fs_info->space_info, list) { + list_for_each_entry(found, &fs_info->space_info, list) { if (found->flags & BTRFS_BLOCK_GROUP_DATA) { int i; @@ -2194,8 +2193,6 @@ static int btrfs_statfs(struct dentry *dentry, struct kstatfs *buf) total_used += found->disk_used; } - rcu_read_unlock(); - buf->f_blocks = div_u64(btrfs_super_total_bytes(disk_super), factor); buf->f_blocks >>= bits; buf->f_bfree = buf->f_blocks - (div_u64(total_used, factor) >> bits); From 49ea112da0e6e323bb923315ec54ee920759bf19 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 1 Sep 2020 17:40:38 -0400 Subject: [PATCH 227/271] btrfs: do not create raid sysfs entries under any locks While running xfstests btrfs/177 I got the following lockdep splat ====================================================== WARNING: possible circular locking dependency detected 5.9.0-rc3+ #5 Not tainted ------------------------------------------------------ kswapd0/100 is trying to acquire lock: ffff97066aa56760 (&delayed_node->mutex){+.+.}-{3:3}, at: __btrfs_release_delayed_node.part.0+0x3f/0x330 but task is already holding lock: ffffffff9fd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 which lock already depends on the new lock. the existing dependency chain (in reverse order) is: -> #3 (fs_reclaim){+.+.}-{0:0}: fs_reclaim_acquire+0x65/0x80 slab_pre_alloc_hook.constprop.0+0x20/0x200 kmem_cache_alloc+0x37/0x270 alloc_inode+0x82/0xb0 iget_locked+0x10d/0x2c0 kernfs_get_inode+0x1b/0x130 kernfs_get_tree+0x136/0x240 sysfs_get_tree+0x16/0x40 vfs_get_tree+0x28/0xc0 path_mount+0x434/0xc00 __x64_sys_mount+0xe3/0x120 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #2 (kernfs_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 kernfs_add_one+0x23/0x150 kernfs_create_dir_ns+0x7a/0xb0 sysfs_create_dir_ns+0x60/0xb0 kobject_add_internal+0xc0/0x2c0 kobject_add+0x6e/0x90 btrfs_sysfs_add_block_group_type+0x102/0x160 btrfs_make_block_group+0x167/0x230 btrfs_alloc_chunk+0x54f/0xb80 btrfs_chunk_alloc+0x18e/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_insert_empty_items+0x64/0xb0 btrfs_new_inode+0x225/0x730 btrfs_create+0xab/0x1f0 lookup_open.isra.0+0x52d/0x690 path_openat+0x2a7/0x9e0 do_filp_open+0x75/0x100 do_sys_openat2+0x7b/0x130 __x64_sys_openat+0x46/0x70 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #1 (&fs_info->chunk_mutex){+.+.}-{3:3}: __mutex_lock+0x7e/0x7e0 btrfs_chunk_alloc+0x125/0x3a0 find_free_extent+0xdf6/0x1210 btrfs_reserve_extent+0xb3/0x1b0 btrfs_alloc_tree_block+0xb0/0x310 alloc_tree_block_no_bg_flush+0x4a/0x60 __btrfs_cow_block+0x11a/0x530 btrfs_cow_block+0x104/0x220 btrfs_search_slot+0x52e/0x9d0 btrfs_lookup_inode+0x2a/0x8f __btrfs_update_delayed_inode+0x80/0x240 btrfs_commit_inode_delayed_inode+0x119/0x120 btrfs_evict_inode+0x357/0x500 evict+0xcf/0x1f0 do_unlinkat+0x1a9/0x2b0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 -> #0 (&delayed_node->mutex){+.+.}-{3:3}: __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 __mutex_lock+0x7e/0x7e0 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 kthread+0x138/0x160 ret_from_fork+0x1f/0x30 other info that might help us debug this: Chain exists of: &delayed_node->mutex --> kernfs_mutex --> fs_reclaim Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(fs_reclaim); lock(kernfs_mutex); lock(fs_reclaim); lock(&delayed_node->mutex); *** DEADLOCK *** 3 locks held by kswapd0/100: #0: ffffffff9fd74700 (fs_reclaim){+.+.}-{0:0}, at: __fs_reclaim_acquire+0x5/0x30 #1: ffffffff9fd65c50 (shrinker_rwsem){++++}-{3:3}, at: shrink_slab+0x115/0x290 #2: ffff9706629780e0 (&type->s_umount_key#36){++++}-{3:3}, at: super_cache_scan+0x38/0x1e0 stack backtrace: CPU: 1 PID: 100 Comm: kswapd0 Not tainted 5.9.0-rc3+ #5 Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 Call Trace: dump_stack+0x8b/0xb8 check_noncircular+0x12d/0x150 __lock_acquire+0x119c/0x1fc0 lock_acquire+0xa7/0x3d0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 __mutex_lock+0x7e/0x7e0 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? __btrfs_release_delayed_node.part.0+0x3f/0x330 ? lock_acquire+0xa7/0x3d0 ? find_held_lock+0x2b/0x80 __btrfs_release_delayed_node.part.0+0x3f/0x330 btrfs_evict_inode+0x24c/0x500 evict+0xcf/0x1f0 dispose_list+0x48/0x70 prune_icache_sb+0x44/0x50 super_cache_scan+0x161/0x1e0 do_shrink_slab+0x178/0x3c0 shrink_slab+0x17c/0x290 shrink_node+0x2b2/0x6d0 balance_pgdat+0x30a/0x670 kswapd+0x213/0x4c0 ? _raw_spin_unlock_irqrestore+0x41/0x50 ? add_wait_queue_exclusive+0x70/0x70 ? balance_pgdat+0x670/0x670 kthread+0x138/0x160 ? kthread_create_worker_on_cpu+0x40/0x40 ret_from_fork+0x1f/0x30 This happens because when we link in a block group with a new raid index type we'll create the corresponding sysfs entries for it. This is problematic because while restriping we're holding the chunk_mutex, and while mounting we're holding the tree locks. Fixing this isn't pretty, we move the call to the sysfs stuff into the btrfs_create_pending_block_groups() work, where we're not holding any locks. This creates a slight race where other threads could see that there's no sysfs kobj for that raid type, and race to create the sysfs dir. Fix this by wrapping the creation in space_info->lock, so we only get one thread calling kobject_add() for the new directory. We don't worry about the lock on cleanup as it only gets deleted on unmount. On mount it's more straightforward, we loop through the space_infos already, just check every raid index in each space_info and added the sysfs entries for the corresponding block groups. Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/block-group.c | 31 +++++++++++++++++++++++++------ fs/btrfs/sysfs.c | 25 +++++++++++++++++++++++-- 2 files changed, 48 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index a3b27204371c..c0f1d6818df7 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1766,16 +1766,10 @@ static void link_block_group(struct btrfs_block_group *cache) { struct btrfs_space_info *space_info = cache->space_info; int index = btrfs_bg_flags_to_raid_index(cache->flags); - bool first = false; down_write(&space_info->groups_sem); - if (list_empty(&space_info->block_groups[index])) - first = true; list_add_tail(&cache->list, &space_info->block_groups[index]); up_write(&space_info->groups_sem); - - if (first) - btrfs_sysfs_add_block_group_type(cache); } static struct btrfs_block_group *btrfs_create_block_group_cache( @@ -2032,6 +2026,17 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) } list_for_each_entry(space_info, &info->space_info, list) { + int i; + + for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) { + if (list_empty(&space_info->block_groups[i])) + continue; + cache = list_first_entry(&space_info->block_groups[i], + struct btrfs_block_group, + list); + btrfs_sysfs_add_block_group_type(cache); + } + if (!(btrfs_get_alloc_profile(info, space_info->flags) & (BTRFS_BLOCK_GROUP_RAID10 | BTRFS_BLOCK_GROUP_RAID1_MASK | @@ -2091,12 +2096,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) return; while (!list_empty(&trans->new_bgs)) { + int index; + block_group = list_first_entry(&trans->new_bgs, struct btrfs_block_group, bg_list); if (ret) goto next; + index = btrfs_bg_flags_to_raid_index(block_group->flags); + ret = insert_block_group_item(trans, block_group); if (ret) btrfs_abort_transaction(trans, ret); @@ -2105,6 +2114,16 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) if (ret) btrfs_abort_transaction(trans, ret); add_block_group_free_space(trans, block_group); + + /* + * If we restriped during balance, we may have added a new raid + * type, so now add the sysfs entries when it is safe to do so. + * We don't have to worry about locking here as it's handled in + * btrfs_sysfs_add_block_group_type. + */ + if (block_group->space_info->block_group_kobjs[index] == NULL) + btrfs_sysfs_add_block_group_type(block_group); + /* Already aborted the transaction if it failed. */ next: btrfs_delayed_refs_rsv_release(fs_info, 1); diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 13354de56201..279d9262b676 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -1137,17 +1137,38 @@ void btrfs_sysfs_add_block_group_type(struct btrfs_block_group *cache) rkobj->flags = cache->flags; kobject_init(&rkobj->kobj, &btrfs_raid_ktype); + + /* + * We call this either on mount, or if we've created a block group for a + * new index type while running (i.e. when restriping). The running + * case is tricky because we could race with other threads, so we need + * to have this check to make sure we didn't already init the kobject. + * + * We don't have to protect on the free side because it only happens on + * unmount. + */ + spin_lock(&space_info->lock); + if (space_info->block_group_kobjs[index]) { + spin_unlock(&space_info->lock); + kobject_put(&rkobj->kobj); + return; + } else { + space_info->block_group_kobjs[index] = &rkobj->kobj; + } + spin_unlock(&space_info->lock); + ret = kobject_add(&rkobj->kobj, &space_info->kobj, "%s", btrfs_bg_type_to_raid_name(rkobj->flags)); memalloc_nofs_restore(nofs_flag); if (ret) { + spin_lock(&space_info->lock); + space_info->block_group_kobjs[index] = NULL; + spin_unlock(&space_info->lock); kobject_put(&rkobj->kobj); btrfs_warn(fs_info, "failed to add kobject for block cache, ignoring"); return; } - - space_info->block_group_kobjs[index] = &rkobj->kobj; } /* From bb56f02f26fe23798edb1b2175707419b28c752a Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 14 Sep 2020 15:27:50 +0100 Subject: [PATCH 228/271] btrfs: reschedule if necessary when logging directory items Logging directories with many entries can take a significant amount of time, and in some cases monopolize a cpu/core for a long time if the logging task doesn't happen to block often enough. Johannes and Lu Fengqi reported test case generic/041 triggering a soft lockup when the kernel has CONFIG_SOFTLOCKUP_DETECTOR=y. For this test case we log an inode with 3002 hard links, and because the test removed one hard link before fsyncing the file, the inode logging causes the parent directory do be logged as well, which has 6004 directory items to log (3002 BTRFS_DIR_ITEM_KEY items plus 3002 BTRFS_DIR_INDEX_KEY items), so it can take a significant amount of time and trigger the soft lockup. So just make tree-log.c:log_dir_items() reschedule when necessary, releasing the current search path before doing so and then resume from where it was before the reschedule. The stack trace produced when the soft lockup happens is the following: [10480.277653] watchdog: BUG: soft lockup - CPU#2 stuck for 22s! [xfs_io:28172] [10480.279418] Modules linked in: dm_thin_pool dm_persistent_data (...) [10480.284915] irq event stamp: 29646366 [10480.285987] hardirqs last enabled at (29646365): [] __slab_alloc.constprop.0+0x56/0x60 [10480.288482] hardirqs last disabled at (29646366): [] irqentry_enter+0x1d/0x50 [10480.290856] softirqs last enabled at (4612): [] __do_softirq+0x323/0x56c [10480.293615] softirqs last disabled at (4483): [] asm_call_on_stack+0xf/0x20 [10480.296428] CPU: 2 PID: 28172 Comm: xfs_io Not tainted 5.9.0-rc4-default+ #1248 [10480.298948] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-59-gc9ba527-rebuilt.opensuse.org 04/01/2014 [10480.302455] RIP: 0010:__slab_alloc.constprop.0+0x19/0x60 [10480.304151] Code: 86 e8 31 75 21 00 66 66 2e 0f 1f 84 00 00 00 (...) [10480.309558] RSP: 0018:ffffadbe09397a58 EFLAGS: 00000282 [10480.311179] RAX: ffff8a495ab92840 RBX: 0000000000000282 RCX: 0000000000000006 [10480.313242] RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff85249b66 [10480.315260] RBP: ffff8a497d04b740 R08: 0000000000000001 R09: 0000000000000001 [10480.317229] R10: ffff8a497d044800 R11: ffff8a495ab93c40 R12: 0000000000000000 [10480.319169] R13: 0000000000000000 R14: 0000000000000c40 R15: ffffffffc01daf70 [10480.321104] FS: 00007fa1dc5c0e40(0000) GS:ffff8a497da00000(0000) knlGS:0000000000000000 [10480.323559] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [10480.325235] CR2: 00007fa1dc5befb8 CR3: 0000000004f8a006 CR4: 0000000000170ea0 [10480.327259] Call Trace: [10480.328286] ? overwrite_item+0x1f0/0x5a0 [btrfs] [10480.329784] __kmalloc+0x831/0xa20 [10480.331009] ? btrfs_get_32+0xb0/0x1d0 [btrfs] [10480.332464] overwrite_item+0x1f0/0x5a0 [btrfs] [10480.333948] log_dir_items+0x2ee/0x570 [btrfs] [10480.335413] log_directory_changes+0x82/0xd0 [btrfs] [10480.336926] btrfs_log_inode+0xc9b/0xda0 [btrfs] [10480.338374] ? init_once+0x20/0x20 [btrfs] [10480.339711] btrfs_log_inode_parent+0x8d3/0xd10 [btrfs] [10480.341257] ? dget_parent+0x97/0x2e0 [10480.342480] btrfs_log_dentry_safe+0x3a/0x50 [btrfs] [10480.343977] btrfs_sync_file+0x24b/0x5e0 [btrfs] [10480.345381] do_fsync+0x38/0x70 [10480.346483] __x64_sys_fsync+0x10/0x20 [10480.347703] do_syscall_64+0x2d/0x70 [10480.348891] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [10480.350444] RIP: 0033:0x7fa1dc80970b [10480.351642] Code: 0f 05 48 3d 00 f0 ff ff 77 45 c3 0f 1f 40 00 48 (...) [10480.356952] RSP: 002b:00007fffb3d081d0 EFLAGS: 00000293 ORIG_RAX: 000000000000004a [10480.359458] RAX: ffffffffffffffda RBX: 0000562d93d45e40 RCX: 00007fa1dc80970b [10480.361426] RDX: 0000562d93d44ab0 RSI: 0000562d93d45e60 RDI: 0000000000000003 [10480.363367] RBP: 0000000000000001 R08: 0000000000000000 R09: 00007fa1dc7b2a40 [10480.365317] R10: 0000562d93d0e366 R11: 0000000000000293 R12: 0000000000000001 [10480.367299] R13: 0000562d93d45290 R14: 0000562d93d45e40 R15: 0000562d93d45e60 Link: https://lore.kernel.org/linux-btrfs/20180713090216.GC575@fnst.localdomain/ Reported-by: Johannes Thumshirn CC: stable@vger.kernel.org # 4.4+ Tested-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Filipe Manana Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-log.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index e35cfe8cf68b..56cbc1706b6f 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -3611,6 +3611,7 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, * search and this search we'll not find the key again and can just * bail. */ +search: ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); if (ret != 0) goto done; @@ -3630,6 +3631,13 @@ static noinline int log_dir_items(struct btrfs_trans_handle *trans, if (min_key.objectid != ino || min_key.type != key_type) goto done; + + if (need_resched()) { + btrfs_release_path(path); + cond_resched(); + goto search; + } + ret = overwrite_item(trans, log, dst_path, src, i, &min_key); if (ret) { From 2f1d3e4b930dcb83ac7ec9b106d7adb4d36c4c9d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:03 +0300 Subject: [PATCH 229/271] btrfs: remove btree_readpage There is no way for this function to be called as ->readpage() since it's called from generic_file_buffered_read/filemap_fault/do_read_cache_page/readhead code. BTRFS doesn't utilize the first 3 for the btree inode and implements it's owon readhead mechanism. So simply remove the function. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f44f3d672a1f..0fae2caf31ef 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -948,11 +948,6 @@ static int btree_writepages(struct address_space *mapping, return btree_write_cache_pages(mapping, wbc); } -static int btree_readpage(struct file *file, struct page *page) -{ - return extent_read_full_page(page, btree_get_extent, 0); -} - static int btree_releasepage(struct page *page, gfp_t gfp_flags) { if (PageWriteback(page) || PageDirty(page)) @@ -992,7 +987,6 @@ static int btree_set_page_dirty(struct page *page) } static const struct address_space_operations btree_aops = { - .readpage = btree_readpage, .writepages = btree_writepages, .releasepage = btree_releasepage, .invalidatepage = btree_invalidatepage, From 0420177c08b2d86034a98d45b7d3609bdc986fd2 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:04 +0300 Subject: [PATCH 230/271] btrfs: simplify metadata pages reading Metadata pages currently use __do_readpage to read metadata pages, unfortunately this function is also used to deal with ordinary data pages. This makes the metadata pages reading code to go through multiple hoops in order to adhere to __do_readpage invariants. Most of these are necessary for data pages which could be compressed. For metadata it's enough to simply build a bio and submit it. To this effect simply call submit_extent_page directly from read_extent_buffer_pages which is the only callpath used to populate extent_buffers with data. This in turn enables further cleanups. Reviewed-by: Qu Wenruo Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index a1e070ec7ad8..0a6cda4c30ed 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -5573,20 +5573,19 @@ int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num) } ClearPageError(page); - err = __extent_read_full_page(page, - btree_get_extent, &bio, - mirror_num, &bio_flags, - REQ_META); + err = submit_extent_page(REQ_OP_READ | REQ_META, NULL, + page, page_offset(page), PAGE_SIZE, 0, + &bio, end_bio_extent_readpage, + mirror_num, 0, 0, false); if (err) { - ret = err; /* - * We use &bio in above __extent_read_full_page, - * so we ensure that if it returns error, the - * current page fails to add itself to bio and - * it's been unlocked. - * - * We must dec io_pages by ourselves. + * We failed to submit the bio so it's the + * caller's responsibility to perform cleanup + * i.e unlock page/set error bit. */ + ret = err; + SetPageError(page); + unlock_page(page); atomic_dec(&eb->io_pages); } } else { From 208d6341e85b4f2628a90438fce5d8a2dd97d6ba Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:05 +0300 Subject: [PATCH 231/271] btrfs: remove btree_get_extent The sole purpose of this function was to satisfy the requirements of __do_readpage. Since that function is no longer used to read metadata pages the need to keep btree_get_extent around has also disappeared. Simply remove it. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 47 ---------------------------------------------- fs/btrfs/disk-io.h | 3 --- 2 files changed, 50 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 0fae2caf31ef..53b588a7e150 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -204,53 +204,6 @@ void btrfs_set_buffer_lockdep_class(u64 objectid, struct extent_buffer *eb, #endif -/* - * extents on the btree inode are pretty simple, there's one extent - * that covers the entire device - */ -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len) -{ - struct extent_map_tree *em_tree = &inode->extent_tree; - struct extent_map *em; - int ret; - - read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); - if (em) { - read_unlock(&em_tree->lock); - goto out; - } - read_unlock(&em_tree->lock); - - em = alloc_extent_map(); - if (!em) { - em = ERR_PTR(-ENOMEM); - goto out; - } - em->start = 0; - em->len = (u64)-1; - em->block_len = (u64)-1; - em->block_start = 0; - - write_lock(&em_tree->lock); - ret = add_extent_mapping(em_tree, em, 0); - if (ret == -EEXIST) { - free_extent_map(em); - em = lookup_extent_mapping(em_tree, start, len); - if (!em) - em = ERR_PTR(-EIO); - } else if (ret) { - free_extent_map(em); - em = ERR_PTR(ret); - } - write_unlock(&em_tree->lock); - -out: - return em; -} - /* * Compute the csum of a btree block and store the result to provided buffer. */ diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 00dc39d47ed3..89b6a709a184 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -123,9 +123,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans, u64 objectid); int btree_lock_page_hook(struct page *page, void *data, void (*flush_fn)(void *)); -struct extent_map *btree_get_extent(struct btrfs_inode *inode, - struct page *page, size_t pg_offset, - u64 start, u64 len); int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags); int __init btrfs_end_io_wq_init(void); void __cold btrfs_end_io_wq_exit(void); From 1a5ee1e6260368da8347e66321e157a7de288ef3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:06 +0300 Subject: [PATCH 232/271] btrfs: remove btrfs_get_extent indirection from __do_readpage Now that this function is only responsible for reading data pages it's no longer necessary to pass get_extent_t parameter across several layers of functions. This patch removes this parameter from multiple functions: __get_extent_map/__do_readpage/__extent_read_full_page/ extent_read_full_page and simply calls btrfs_get_extent directly in __get_extent_map. Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 31 ++++++++++++------------------- fs/btrfs/extent_io.h | 3 +-- fs/btrfs/inode.c | 2 +- 3 files changed, 14 insertions(+), 22 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 0a6cda4c30ed..4a0675ec90fa 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3110,8 +3110,7 @@ void set_page_extent_mapped(struct page *page) static struct extent_map * __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, - u64 start, u64 len, get_extent_t *get_extent, - struct extent_map **em_cached) + u64 start, u64 len, struct extent_map **em_cached) { struct extent_map *em; @@ -3127,7 +3126,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, *em_cached = NULL; } - em = get_extent(BTRFS_I(inode), page, pg_offset, start, len); + em = btrfs_get_extent(BTRFS_I(inode), page, pg_offset, start, len); if (em_cached && !IS_ERR_OR_NULL(em)) { BUG_ON(*em_cached); refcount_inc(&em->refs); @@ -3142,9 +3141,7 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, - get_extent_t *get_extent, - struct extent_map **em_cached, +static int __do_readpage(struct page *page, struct extent_map **em_cached, struct bio **bio, int mirror_num, unsigned long *bio_flags, unsigned int read_flags, u64 *prev_em_start) @@ -3209,7 +3206,7 @@ static int __do_readpage(struct page *page, break; } em = __get_extent_map(inode, page, pg_offset, cur, - end - cur + 1, get_extent, em_cached); + end - cur + 1, em_cached); if (IS_ERR_OR_NULL(em)) { SetPageError(page); unlock_extent(tree, cur, end); @@ -3362,16 +3359,14 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], btrfs_get_extent, em_cached, - bio, 0, bio_flags, REQ_RAHEAD, prev_em_start); + __do_readpage(pages[index], em_cached, bio, 0, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -static int __extent_read_full_page(struct page *page, - get_extent_t *get_extent, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, +static int __extent_read_full_page(struct page *page, struct bio **bio, + int mirror_num, unsigned long *bio_flags, unsigned int read_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); @@ -3381,20 +3376,18 @@ static int __extent_read_full_page(struct page *page, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, get_extent, NULL, bio, mirror_num, - bio_flags, read_flags, NULL); + ret = __do_readpage(page, NULL, bio, mirror_num, bio_flags, read_flags, + NULL); return ret; } -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num) +int extent_read_full_page(struct page *page, int mirror_num) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(page, get_extent, &bio, mirror_num, - &bio_flags, 0); + ret = __extent_read_full_page(page, &bio, mirror_num, &bio_flags, 0); if (bio) ret = submit_one_bio(bio, mirror_num, bio_flags); return ret; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 06611947a9f7..272d5281bd4d 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -193,8 +193,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, get_extent_t *get_extent, - int mirror_num); +int extent_read_full_page(struct page *page, int mirror_num); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index ff9b22b71171..7db2cbb469e1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,7 +8040,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, btrfs_get_extent, 0); + return extent_read_full_page(page, 0); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) From 72cffee4634058b993685669cab62c5d1c1fee78 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:07 +0300 Subject: [PATCH 233/271] btrfs: remove mirror_num argument from extent_read_full_page It's called only from btrfs_readpage which always passes 0 so just sink the argument into extent_read_full_page. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 +++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 4a0675ec90fa..355db40a1cb5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3381,15 +3381,15 @@ static int __extent_read_full_page(struct page *page, struct bio **bio, return ret; } -int extent_read_full_page(struct page *page, int mirror_num) +int extent_read_full_page(struct page *page) { struct bio *bio = NULL; unsigned long bio_flags = 0; int ret; - ret = __extent_read_full_page(page, &bio, mirror_num, &bio_flags, 0); + ret = __extent_read_full_page(page, &bio, 0, &bio_flags, 0); if (bio) - ret = submit_one_bio(bio, mirror_num, bio_flags); + ret = submit_one_bio(bio, 0, bio_flags); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 272d5281bd4d..0ccb2dabc291 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -193,7 +193,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page, int mirror_num); +int extent_read_full_page(struct page *page); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7db2cbb469e1..aaf998f3133c 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,7 +8040,7 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page, 0); + return extent_read_full_page(page); } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) From c1be9c1ad5cca645ffd31284674ee4e28b9472de Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:08 +0300 Subject: [PATCH 234/271] btrfs: promote extent_read_full_page to btrfs_readpage Now that btrfs_readpage is the only caller of extent_read_full_page the latter can be open coded in the former. Use the occassion to rename __extent_read_full_page to extent_read_full_page. To facillitate this change submit_one_bio has to be exported as well. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 21 ++++----------------- fs/btrfs/extent_io.h | 5 ++++- fs/btrfs/inode.c | 9 ++++++++- 3 files changed, 16 insertions(+), 19 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 355db40a1cb5..402b88ddcbca 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -160,8 +160,8 @@ static int add_extent_changeset(struct extent_state *state, unsigned bits, return ret; } -static int __must_check submit_one_bio(struct bio *bio, int mirror_num, - unsigned long bio_flags) +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags) { blk_status_t ret = 0; struct extent_io_tree *tree = bio->bi_private; @@ -3365,9 +3365,8 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -static int __extent_read_full_page(struct page *page, struct bio **bio, - int mirror_num, unsigned long *bio_flags, - unsigned int read_flags) +int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, + unsigned long *bio_flags, unsigned int read_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); @@ -3381,18 +3380,6 @@ static int __extent_read_full_page(struct page *page, struct bio **bio, return ret; } -int extent_read_full_page(struct page *page) -{ - struct bio *bio = NULL; - unsigned long bio_flags = 0; - int ret; - - ret = __extent_read_full_page(page, &bio, 0, &bio_flags, 0); - if (bio) - ret = submit_one_bio(bio, 0, bio_flags); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 0ccb2dabc291..8fec00c50846 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -193,7 +193,10 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode, int try_release_extent_mapping(struct page *page, gfp_t mask); int try_release_extent_buffer(struct page *page); -int extent_read_full_page(struct page *page); +int __must_check submit_one_bio(struct bio *bio, int mirror_num, + unsigned long bio_flags); +int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, + unsigned long *bio_flags, unsigned int read_flags); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index aaf998f3133c..55c5617c180b 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,7 +8040,14 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - return extent_read_full_page(page); + struct bio *bio = NULL; + unsigned long bio_flags = 0; + int ret; + + ret = extent_read_full_page(page, &bio, 0, &bio_flags, 0); + if (bio) + ret = submit_one_bio(bio, 0, bio_flags); + return ret; } static int btrfs_writepage(struct page *page, struct writeback_control *wbc) From 003c286aef3f73e8e7b5fce6040761966ba65b5b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:09 +0300 Subject: [PATCH 235/271] btrfs: sink mirror_num argument in extent_read_full_page It's always set to 0 from the sole caller - btrfs_readpage. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 5 ++--- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 402b88ddcbca..f43827bee7e6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3365,7 +3365,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } } -int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, +int extent_read_full_page(struct page *page, struct bio **bio, unsigned long *bio_flags, unsigned int read_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); @@ -3375,8 +3375,7 @@ int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, NULL, bio, mirror_num, bio_flags, read_flags, - NULL); + ret = __do_readpage(page, NULL, bio, 0, bio_flags, read_flags, NULL); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 8fec00c50846..3562c9203de3 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -195,7 +195,7 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); -int extent_read_full_page(struct page *page, struct bio **bio, int mirror_num, +int extent_read_full_page(struct page *page, struct bio **bio, unsigned long *bio_flags, unsigned int read_flags); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 55c5617c180b..a08917e72f05 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8044,7 +8044,7 @@ int btrfs_readpage(struct file *file, struct page *page) unsigned long bio_flags = 0; int ret; - ret = extent_read_full_page(page, &bio, 0, &bio_flags, 0); + ret = extent_read_full_page(page, &bio, &bio_flags, 0); if (bio) ret = submit_one_bio(bio, 0, bio_flags); return ret; From 6f15af6060057babf03d3b0dce2b0d7a9258620c Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:10 +0300 Subject: [PATCH 236/271] btrfs: sink read_flags argument into extent_read_full_page It's always set to 0 by its sole caller - btrfs_readpage. Simply remove it. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 4 ++-- fs/btrfs/extent_io.h | 2 +- fs/btrfs/inode.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f43827bee7e6..1c959d66195c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3366,7 +3366,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, } int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags, unsigned int read_flags) + unsigned long *bio_flags) { struct btrfs_inode *inode = BTRFS_I(page->mapping->host); u64 start = page_offset(page); @@ -3375,7 +3375,7 @@ int extent_read_full_page(struct page *page, struct bio **bio, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, NULL, bio, 0, bio_flags, read_flags, NULL); + ret = __do_readpage(page, NULL, bio, 0, bio_flags, 0, NULL); return ret; } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3562c9203de3..5fa248570145 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -196,7 +196,7 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags, unsigned int read_flags); + unsigned long *bio_flags); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a08917e72f05..4b40d9703be6 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8044,7 +8044,7 @@ int btrfs_readpage(struct file *file, struct page *page) unsigned long bio_flags = 0; int ret; - ret = extent_read_full_page(page, &bio, &bio_flags, 0); + ret = extent_read_full_page(page, &bio, &bio_flags); if (bio) ret = submit_one_bio(bio, 0, bio_flags); return ret; From fd513000eb279347df1f5a7ce720ae66b4ac3b1b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 12:37:11 +0300 Subject: [PATCH 237/271] btrfs: sink mirror_num argument in __do_readpage It's always set to 0 by the 2 callers so move it inside __do_readpage. Reviewed-by: Josef Bacik Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1c959d66195c..1ef857c0d784 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3142,9 +3142,8 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * return 0 on success, otherwise return error */ static int __do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, int mirror_num, - unsigned long *bio_flags, unsigned int read_flags, - u64 *prev_em_start) + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3322,7 +3321,7 @@ static int __do_readpage(struct page *page, struct extent_map **em_cached, ret = submit_extent_page(REQ_OP_READ | read_flags, NULL, page, offset, disk_io_size, pg_offset, bio, - end_bio_extent_readpage, mirror_num, + end_bio_extent_readpage, 0, *bio_flags, this_bio_flag, force_bio_submit); @@ -3359,7 +3358,7 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], em_cached, bio, 0, bio_flags, + __do_readpage(pages[index], em_cached, bio, bio_flags, REQ_RAHEAD, prev_em_start); put_page(pages[index]); } @@ -3375,7 +3374,7 @@ int extent_read_full_page(struct page *page, struct bio **bio, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - ret = __do_readpage(page, NULL, bio, 0, bio_flags, 0, NULL); + ret = __do_readpage(page, NULL, bio, bio_flags, 0, NULL); return ret; } From 0f208812493f4ce17910633ad3f7f3b7c8c35cd3 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Mon, 14 Sep 2020 14:39:16 +0300 Subject: [PATCH 238/271] btrfs: open code extent_read_full_page to its sole caller This makes reading the code a tad easier by decreasing the level of indirection by one. Reviewed-by: Johannes Thumshirn Reviewed-by: Johannes Thumshirn Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 24 +++++------------------- fs/btrfs/extent_io.h | 5 +++-- fs/btrfs/inode.c | 9 +++++++-- 3 files changed, 15 insertions(+), 23 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1ef857c0d784..afac70ef0cc5 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3141,9 +3141,9 @@ __get_extent_map(struct inode *inode, struct page *page, size_t pg_offset, * XXX JDM: This needs looking at to ensure proper page locking * return 0 on success, otherwise return error */ -static int __do_readpage(struct page *page, struct extent_map **em_cached, - struct bio **bio, unsigned long *bio_flags, - unsigned int read_flags, u64 *prev_em_start) +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start) { struct inode *inode = page->mapping->host; u64 start = page_offset(page); @@ -3358,26 +3358,12 @@ static inline void contiguous_readpages(struct page *pages[], int nr_pages, btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); for (index = 0; index < nr_pages; index++) { - __do_readpage(pages[index], em_cached, bio, bio_flags, - REQ_RAHEAD, prev_em_start); + btrfs_do_readpage(pages[index], em_cached, bio, bio_flags, + REQ_RAHEAD, prev_em_start); put_page(pages[index]); } } -int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags) -{ - struct btrfs_inode *inode = BTRFS_I(page->mapping->host); - u64 start = page_offset(page); - u64 end = start + PAGE_SIZE - 1; - int ret; - - btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); - - ret = __do_readpage(page, NULL, bio, bio_flags, 0, NULL); - return ret; -} - static void update_nr_written(struct writeback_control *wbc, unsigned long nr_written) { diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 5fa248570145..3bbc25b816ea 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -195,8 +195,9 @@ int try_release_extent_buffer(struct page *page); int __must_check submit_one_bio(struct bio *bio, int mirror_num, unsigned long bio_flags); -int extent_read_full_page(struct page *page, struct bio **bio, - unsigned long *bio_flags); +int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, + struct bio **bio, unsigned long *bio_flags, + unsigned int read_flags, u64 *prev_em_start); int extent_write_full_page(struct page *page, struct writeback_control *wbc); int extent_write_locked_range(struct inode *inode, u64 start, u64 end, int mode); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 4b40d9703be6..c5b24a029eb3 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8040,11 +8040,16 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int btrfs_readpage(struct file *file, struct page *page) { - struct bio *bio = NULL; + struct btrfs_inode *inode = BTRFS_I(page->mapping->host); + u64 start = page_offset(page); + u64 end = start + PAGE_SIZE - 1; unsigned long bio_flags = 0; + struct bio *bio = NULL; int ret; - ret = extent_read_full_page(page, &bio, &bio_flags); + btrfs_lock_and_flush_ordered_range(inode, start, end, NULL); + + ret = btrfs_do_readpage(page, NULL, &bio, &bio_flags, 0, NULL); if (bio) ret = submit_one_bio(bio, 0, bio_flags); return ret; From 633cc816f742ef34b0dac76581d732a28ca8d62d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:49 +0300 Subject: [PATCH 239/271] btrfs: clean BTRFS_I usage in btrfs_destroy_inode Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 41 ++++++++++++++++++++--------------------- 1 file changed, 20 insertions(+), 21 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index c5b24a029eb3..d8cdc9829f84 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -8657,21 +8657,21 @@ void btrfs_free_inode(struct inode *inode) kmem_cache_free(btrfs_inode_cachep, BTRFS_I(inode)); } -void btrfs_destroy_inode(struct inode *inode) +void btrfs_destroy_inode(struct inode *vfs_inode) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_extent *ordered; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_inode *inode = BTRFS_I(vfs_inode); + struct btrfs_root *root = inode->root; - WARN_ON(!hlist_empty(&inode->i_dentry)); - WARN_ON(inode->i_data.nrpages); - WARN_ON(BTRFS_I(inode)->block_rsv.reserved); - WARN_ON(BTRFS_I(inode)->block_rsv.size); - WARN_ON(BTRFS_I(inode)->outstanding_extents); - WARN_ON(BTRFS_I(inode)->delalloc_bytes); - WARN_ON(BTRFS_I(inode)->new_delalloc_bytes); - WARN_ON(BTRFS_I(inode)->csum_bytes); - WARN_ON(BTRFS_I(inode)->defrag_bytes); + WARN_ON(!hlist_empty(&vfs_inode->i_dentry)); + WARN_ON(vfs_inode->i_data.nrpages); + WARN_ON(inode->block_rsv.reserved); + WARN_ON(inode->block_rsv.size); + WARN_ON(inode->outstanding_extents); + WARN_ON(inode->delalloc_bytes); + WARN_ON(inode->new_delalloc_bytes); + WARN_ON(inode->csum_bytes); + WARN_ON(inode->defrag_bytes); /* * This can happen where we create an inode, but somebody else also @@ -8682,24 +8682,23 @@ void btrfs_destroy_inode(struct inode *inode) return; while (1) { - ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode), - (u64)-1); + ordered = btrfs_lookup_first_ordered_extent(inode, (u64)-1); if (!ordered) break; else { - btrfs_err(fs_info, + btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); - btrfs_remove_ordered_extent(inode, ordered); + btrfs_remove_ordered_extent(vfs_inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } } - btrfs_qgroup_check_reserved_leak(BTRFS_I(inode)); - inode_tree_del(BTRFS_I(inode)); - btrfs_drop_extent_cache(BTRFS_I(inode), 0, (u64)-1, 0); - btrfs_inode_clear_file_extent_range(BTRFS_I(inode), 0, (u64)-1); - btrfs_put_root(BTRFS_I(inode)->root); + btrfs_qgroup_check_reserved_leak(inode); + inode_tree_del(inode); + btrfs_drop_extent_cache(inode, 0, (u64)-1, 0); + btrfs_inode_clear_file_extent_range(inode, 0, (u64)-1); + btrfs_put_root(inode->root); } int btrfs_drop_inode(struct inode *inode) From 71fe0a55dae7e5ef7762c5627891b37bf1b367bf Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:50 +0300 Subject: [PATCH 240/271] btrfs: switch btrfs_remove_ordered_extent to btrfs_inode Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 4 ++-- fs/btrfs/ordered-data.c | 7 +++---- fs/btrfs/ordered-data.h | 2 +- 3 files changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d8cdc9829f84..42d5e1c0a5d1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2753,7 +2753,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) * This needs to be done to make sure anybody waiting knows we are done * updating everything for this ordered extent. */ - btrfs_remove_ordered_extent(inode, ordered_extent); + btrfs_remove_ordered_extent(BTRFS_I(inode), ordered_extent); /* once for us */ btrfs_put_ordered_extent(ordered_extent); @@ -8689,7 +8689,7 @@ void btrfs_destroy_inode(struct inode *vfs_inode) btrfs_err(root->fs_info, "found ordered extent %llu %llu on inode cleanup", ordered->file_offset, ordered->num_bytes); - btrfs_remove_ordered_extent(vfs_inode, ordered); + btrfs_remove_ordered_extent(inode, ordered); btrfs_put_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c8a13d143877..c93b4f2c778b 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -463,13 +463,12 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) * remove an ordered extent from the tree. No references are dropped * and waiters are woken up. */ -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_ordered_inode_tree *tree; - struct btrfs_inode *btrfs_inode = BTRFS_I(inode); struct btrfs_root *root = btrfs_inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct rb_node *node; bool pending; @@ -527,7 +526,7 @@ void btrfs_remove_ordered_extent(struct inode *inode, list_del_init(&entry->root_extent_list); root->nr_ordered_extents--; - trace_btrfs_ordered_extent_remove(BTRFS_I(inode), entry); + trace_btrfs_ordered_extent_remove(btrfs_inode, entry); if (!root->nr_ordered_extents) { spin_lock(&fs_info->ordered_root_lock); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 6a1d5bf5aee3..8fe720da0b31 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -151,7 +151,7 @@ btrfs_ordered_inode_tree_init(struct btrfs_ordered_inode_tree *t) } void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry); -void btrfs_remove_ordered_extent(struct inode *inode, +void btrfs_remove_ordered_extent(struct btrfs_inode *btrfs_inode, struct btrfs_ordered_extent *entry); int btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, struct btrfs_ordered_extent **cached, From 3c38c877fcb9504b1db12cb24de9f514cedfae74 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:51 +0300 Subject: [PATCH 241/271] btrfs: sink inode argument in insert_ordered_extent_file_extent Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 42d5e1c0a5d1..eca42f95b0a8 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2549,7 +2549,6 @@ static void btrfs_release_delalloc_bytes(struct btrfs_fs_info *fs_info, } static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, - struct inode *inode, struct btrfs_ordered_extent *oe) { struct btrfs_file_extent_item stack_fi; @@ -2569,8 +2568,9 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type); /* Encryption and other encoding is reserved and all 0 */ - return insert_reserved_file_extent(trans, BTRFS_I(inode), oe->file_offset, - &stack_fi, oe->qgroup_rsv); + return insert_reserved_file_extent(trans, BTRFS_I(oe->inode), + oe->file_offset, &stack_fi, + oe->qgroup_rsv); } /* @@ -2667,8 +2667,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) logical_len); } else { BUG_ON(root == fs_info->tree_root); - ret = insert_ordered_extent_file_extent(trans, inode, - ordered_extent); + ret = insert_ordered_extent_file_extent(trans, ordered_extent); if (!ret) { clear_reserved_extent = false; btrfs_release_delalloc_bytes(fs_info, From 510f85edf1cdf1dbc937eed377442b2826bbb120 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:52 +0300 Subject: [PATCH 242/271] btrfs: remove inode argument from add_pending_csums It's used to reference the csum root which can be done from the trans handle as well. Simplify the signature and while at it also remove the noinline attribute as the function uses only at most 16 bytes of stack space. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/inode.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index eca42f95b0a8..0280eb31dd63 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2246,16 +2246,15 @@ static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, * given a list of ordered sums record them in the inode. This happens * at IO completion time based on sums calculated at bio submission time. */ -static noinline int add_pending_csums(struct btrfs_trans_handle *trans, - struct inode *inode, struct list_head *list) +static int add_pending_csums(struct btrfs_trans_handle *trans, + struct list_head *list) { struct btrfs_ordered_sum *sum; int ret; list_for_each_entry(sum, list, list) { trans->adding_csums = true; - ret = btrfs_csum_file_blocks(trans, - BTRFS_I(inode)->root->fs_info->csum_root, sum); + ret = btrfs_csum_file_blocks(trans, trans->fs_info->csum_root, sum); trans->adding_csums = false; if (ret) return ret; @@ -2683,7 +2682,7 @@ static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = add_pending_csums(trans, inode, &ordered_extent->list); + ret = add_pending_csums(trans, &ordered_extent->list); if (ret) { btrfs_abort_transaction(trans, ret); goto out; From c0a43603056cb79be8a0dffeedc544ce9b5d115d Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 12:15:53 +0300 Subject: [PATCH 243/271] btrfs: remove inode argument from btrfs_start_ordered_extent The passed in ordered_extent struct is always well-formed and contains the inode making the explicit argument redundant. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/file.c | 3 +-- fs/btrfs/inode.c | 8 ++++---- fs/btrfs/ioctl.c | 2 +- fs/btrfs/ordered-data.c | 15 +++++++-------- fs/btrfs/ordered-data.h | 3 +-- 5 files changed, 14 insertions(+), 17 deletions(-) diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 2969d715c588..038e0afaf3d0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -1491,8 +1491,7 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages, unlock_page(pages[i]); put_page(pages[i]); } - btrfs_start_ordered_extent(&inode->vfs_inode, - ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); return -EAGAIN; } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 0280eb31dd63..b8481632dfc7 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2357,7 +2357,7 @@ static void btrfs_writepage_fixup_worker(struct btrfs_work *work) unlock_extent_cached(&inode->io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -4581,7 +4581,7 @@ int btrfs_truncate_block(struct inode *inode, loff_t from, loff_t len, &cached_state); unlock_page(page); put_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } @@ -7149,7 +7149,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, */ if (writing || test_bit(BTRFS_ORDERED_DIRECT, &ordered->flags)) - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); else ret = -ENOTBLK; btrfs_put_ordered_extent(ordered); @@ -8324,7 +8324,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index de65cf6105e4..ab408a23ba32 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -1319,7 +1319,7 @@ static int cluster_pages_for_defrag(struct inode *inode, break; unlock_page(page); - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); lock_page(page); /* diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index c93b4f2c778b..87bac9ecdf4c 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -543,7 +543,7 @@ static void btrfs_run_ordered_extent_work(struct btrfs_work *work) struct btrfs_ordered_extent *ordered; ordered = container_of(work, struct btrfs_ordered_extent, flush_work); - btrfs_start_ordered_extent(ordered->inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); complete(&ordered->completion); } @@ -649,14 +649,13 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, * in the extent, and it waits on the io completion code to insert * metadata into the btree corresponding to the extent */ -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, - int wait) +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; + struct btrfs_inode *inode = BTRFS_I(entry->inode); - trace_btrfs_ordered_extent_start(BTRFS_I(inode), entry); + trace_btrfs_ordered_extent_start(inode, entry); /* * pages in the range can be dirty, clean or writeback. We @@ -664,7 +663,7 @@ void btrfs_start_ordered_extent(struct inode *inode, * for the flusher thread to find them */ if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->i_mapping, start, end); + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); if (wait) { wait_event(entry->wait, test_bit(BTRFS_ORDERED_COMPLETE, &entry->flags)); @@ -719,7 +718,7 @@ int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len) btrfs_put_ordered_extent(ordered); break; } - btrfs_start_ordered_extent(inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); end = ordered->file_offset; /* * If the ordered extent had an error save the error but don't @@ -939,7 +938,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, break; } unlock_extent_cached(&inode->io_tree, start, end, cachedp); - btrfs_start_ordered_extent(&inode->vfs_inode, ordered, 1); + btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); } } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 8fe720da0b31..c3a2325e64a4 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -174,8 +174,7 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct inode *inode, - struct btrfs_ordered_extent *entry, int wait); +void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry, int wait); int btrfs_wait_ordered_range(struct inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); From 8eb2fd00153a3a96a19c62ac9c6d48c2efebe5e8 Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 21 Sep 2020 20:03:35 +0300 Subject: [PATCH 244/271] btrfs: use kvzalloc() to allocate clone_roots in btrfs_ioctl_send() btrfs_ioctl_send() used open-coded kvzalloc implementation earlier. The code was accidentally replaced with kzalloc() call [1]. Restore the original code by using kvzalloc() to allocate sctx->clone_roots. [1] https://patchwork.kernel.org/patch/9757891/#20529627 Fixes: 818e010bf9d0 ("btrfs: replace opencoded kvzalloc with the helper") CC: stable@vger.kernel.org # 4.14+ Signed-off-by: Denis Efremov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index ac0e942e8d7c..79b7d15ca50f 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7149,7 +7149,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - sctx->clone_roots = kzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; From bae12df966f0e1a9b40a2c46d01a0ad79b2c865c Mon Sep 17 00:00:00 2001 From: Denis Efremov Date: Mon, 21 Sep 2020 20:03:36 +0300 Subject: [PATCH 245/271] btrfs: use kvcalloc for allocation in btrfs_ioctl_send() Replace kvzalloc() call with kvcalloc() that also checks the size internally. There's a standalone overflow check in the function so we can return invalid parameter combination. Use array_size() helper to compute the memory size for clone_sources_tmp. Cc: Kees Cook Signed-off-by: Denis Efremov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/send.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 79b7d15ca50f..b84f921ed6c0 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7061,7 +7061,7 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) u32 i; u64 *clone_sources_tmp = NULL; int clone_sources_to_rollback = 0; - unsigned alloc_size; + size_t alloc_size; int sort_clone_roots = 0; if (!capable(CAP_SYS_ADMIN)) @@ -7147,15 +7147,16 @@ long btrfs_ioctl_send(struct file *mnt_file, struct btrfs_ioctl_send_args *arg) sctx->waiting_dir_moves = RB_ROOT; sctx->orphan_dirs = RB_ROOT; - alloc_size = sizeof(struct clone_root) * (arg->clone_sources_count + 1); - - sctx->clone_roots = kvzalloc(alloc_size, GFP_KERNEL); + sctx->clone_roots = kvcalloc(sizeof(*sctx->clone_roots), + arg->clone_sources_count + 1, + GFP_KERNEL); if (!sctx->clone_roots) { ret = -ENOMEM; goto out; } - alloc_size = arg->clone_sources_count * sizeof(*arg->clone_sources); + alloc_size = array_size(sizeof(*arg->clone_sources), + arg->clone_sources_count); if (arg->clone_sources_count) { clone_sources_tmp = kvmalloc(alloc_size, GFP_KERNEL); From 6b613cc97f0ace77f92f7bc112b8f6ad3f52baf8 Mon Sep 17 00:00:00 2001 From: Johannes Thumshirn Date: Tue, 22 Sep 2020 17:27:29 +0900 Subject: [PATCH 246/271] btrfs: reschedule when cloning lots of extents We have several occurrences of a soft lockup from fstest's generic/175 testcase, which look more or less like this one: watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [xfs_io:10030] Kernel panic - not syncing: softlockup: hung tasks CPU: 0 PID: 10030 Comm: xfs_io Tainted: G L 5.9.0-rc5+ #768 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.13.0-0-gf21b5a4-rebuilt.opensuse.org 04/01/2014 Call Trace: dump_stack+0x77/0xa0 panic+0xfa/0x2cb watchdog_timer_fn.cold+0x85/0xa5 ? lockup_detector_update_enable+0x50/0x50 __hrtimer_run_queues+0x99/0x4c0 ? recalibrate_cpu_khz+0x10/0x10 hrtimer_run_queues+0x9f/0xb0 update_process_times+0x28/0x80 tick_handle_periodic+0x1b/0x60 __sysvec_apic_timer_interrupt+0x76/0x210 asm_call_on_stack+0x12/0x20 sysvec_apic_timer_interrupt+0x7f/0x90 asm_sysvec_apic_timer_interrupt+0x12/0x20 RIP: 0010:btrfs_tree_unlock+0x91/0x1a0 [btrfs] RSP: 0018:ffffc90007123a58 EFLAGS: 00000282 RAX: ffff8881cea2fbe0 RBX: ffff8881cea2fbe0 RCX: 0000000000000000 RDX: ffff8881d23fd200 RSI: ffffffff82045220 RDI: ffff8881cea2fba0 RBP: 0000000000000001 R08: 0000000000000000 R09: 0000000000000032 R10: 0000160000000000 R11: 0000000000001000 R12: 0000000000001000 R13: ffff8882357fd5b0 R14: ffff88816fa76e70 R15: ffff8881cea2fad0 ? btrfs_tree_unlock+0x15b/0x1a0 [btrfs] btrfs_release_path+0x67/0x80 [btrfs] btrfs_insert_replace_extent+0x177/0x2c0 [btrfs] btrfs_replace_file_extents+0x472/0x7c0 [btrfs] btrfs_clone+0x9ba/0xbd0 [btrfs] btrfs_clone_files.isra.0+0xeb/0x140 [btrfs] ? file_update_time+0xcd/0x120 btrfs_remap_file_range+0x322/0x3b0 [btrfs] do_clone_file_range+0xb7/0x1e0 vfs_clone_file_range+0x30/0xa0 ioctl_file_clone+0x8a/0xc0 do_vfs_ioctl+0x5b2/0x6f0 __x64_sys_ioctl+0x37/0xa0 do_syscall_64+0x33/0x40 entry_SYSCALL_64_after_hwframe+0x44/0xa9 RIP: 0033:0x7f87977fc247 RSP: 002b:00007ffd51a2f6d8 EFLAGS: 00000206 ORIG_RAX: 0000000000000010 RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f87977fc247 RDX: 00007ffd51a2f710 RSI: 000000004020940d RDI: 0000000000000003 RBP: 0000000000000004 R08: 00007ffd51a79080 R09: 0000000000000000 R10: 00005621f11352f2 R11: 0000000000000206 R12: 0000000000000000 R13: 0000000000000000 R14: 00005621f128b958 R15: 0000000080000000 Kernel Offset: disabled ---[ end Kernel panic - not syncing: softlockup: hung tasks ]--- All of these lockup reports have the call chain btrfs_clone_files() -> btrfs_clone() in common. btrfs_clone_files() calls btrfs_clone() with both source and destination extents locked and loops over the source extent to create the clones. Conditionally reschedule in the btrfs_clone() loop, to give some time back to other processes. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Johannes Thumshirn Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/reflink.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 39b3269e5760..99aa87c08912 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -520,6 +520,8 @@ static int btrfs_clone(struct inode *src, struct inode *inode, ret = -EINTR; goto out; } + + cond_resched(); } ret = 0; From 2c53a14dd30124de9468316933715deaf7443096 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 15 Sep 2020 13:35:27 +0800 Subject: [PATCH 247/271] btrfs: use own btree inode io_tree owner id Btree inode is special compared to all other inode extent io_trees, although it has a btrfs inode, it doesn't have the track_uptodate bit at all. This means a lot of things like extent locking doesn't even need to be applied to btree io tree. Since it's so special, adds a new owner value for it to make debuging a little easier. Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 2 +- fs/btrfs/extent-io-tree.h | 1 + include/trace/events/btrfs.h | 1 + 3 files changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 53b588a7e150..ebc110231f30 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -2062,7 +2062,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node); extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_INODE_IO, inode); + IO_TREE_BTREE_INODE_IO, inode); BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 250b8cbaaf97..53e7b5b5d6ad 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -40,6 +40,7 @@ struct io_failure_record; enum { IO_TREE_FS_PINNED_EXTENTS, IO_TREE_FS_EXCLUDED_EXTENTS, + IO_TREE_BTREE_INODE_IO, IO_TREE_INODE_IO, IO_TREE_INODE_IO_FAILURE, IO_TREE_RELOC_BLOCKS, diff --git a/include/trace/events/btrfs.h b/include/trace/events/btrfs.h index 8d311062d376..ecd24c719de4 100644 --- a/include/trace/events/btrfs.h +++ b/include/trace/events/btrfs.h @@ -79,6 +79,7 @@ struct btrfs_space_info; #define IO_TREE_OWNER \ EM( IO_TREE_FS_PINNED_EXTENTS, "PINNED_EXTENTS") \ EM( IO_TREE_FS_EXCLUDED_EXTENTS, "EXCLUDED_EXTENTS") \ + EM( IO_TREE_BTREE_INODE_IO, "BTREE_INODE_IO") \ EM( IO_TREE_INODE_IO, "INODE_IO") \ EM( IO_TREE_INODE_IO_FAILURE, "INODE_IO_FAILURE") \ EM( IO_TREE_RELOC_BLOCKS, "RELOC_BLOCKS") \ From e2f896b3180e3b5d34b1d174b931205a43072d17 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 10:54:23 +0200 Subject: [PATCH 248/271] btrfs: send: use helpers for unaligned access to header members The header is mapped onto the send buffer and thus its members may be potentially unaligned so use the helpers instead of directly assigning the pointers. This has worked so far but let's use the helpers to make that clear. Signed-off-by: David Sterba --- fs/btrfs/send.c | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index b84f921ed6c0..9f1ee52482c9 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -577,8 +577,8 @@ static int tlv_put(struct send_ctx *sctx, u16 attr, const void *data, int len) return -EOVERFLOW; hdr = (struct btrfs_tlv_header *) (sctx->send_buf + sctx->send_size); - hdr->tlv_type = cpu_to_le16(attr); - hdr->tlv_len = cpu_to_le16(len); + put_unaligned_le16(attr, &hdr->tlv_type); + put_unaligned_le16(len, &hdr->tlv_len); memcpy(hdr + 1, data, len); sctx->send_size += total_len; @@ -688,7 +688,7 @@ static int begin_cmd(struct send_ctx *sctx, int cmd) sctx->send_size += sizeof(*hdr); hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->cmd = cpu_to_le16(cmd); + put_unaligned_le16(cmd, &hdr->cmd); return 0; } @@ -700,17 +700,17 @@ static int send_cmd(struct send_ctx *sctx) u32 crc; hdr = (struct btrfs_cmd_header *)sctx->send_buf; - hdr->len = cpu_to_le32(sctx->send_size - sizeof(*hdr)); - hdr->crc = 0; + put_unaligned_le32(sctx->send_size - sizeof(*hdr), &hdr->len); + put_unaligned_le32(0, &hdr->crc); crc = btrfs_crc32c(0, (unsigned char *)sctx->send_buf, sctx->send_size); - hdr->crc = cpu_to_le32(crc); + put_unaligned_le32(crc, &hdr->crc); ret = write_buf(sctx->send_filp, sctx->send_buf, sctx->send_size, &sctx->send_off); sctx->total_send_size += sctx->send_size; - sctx->cmd_send_size[le16_to_cpu(hdr->cmd)] += sctx->send_size; + sctx->cmd_send_size[get_unaligned_le16(&hdr->cmd)] += sctx->send_size; sctx->send_size = 0; return ret; From 6994ca367ce5160dea2718e0993542d148fa68f8 Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 13:32:34 +0200 Subject: [PATCH 249/271] btrfs: free-space-cache: use unaligned helpers to access data The free space inode stores the tracking data, checksums etc, using the io_ctl structure and moving the pointers. The data are generally aligned to at least 4 bytes (u32 for CRC) so it's not completely unaligned but for clarity we should use the proper helpers whenever a struct is initialized from io_ctl->cur pointer. Signed-off-by: David Sterba --- fs/btrfs/free-space-cache.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 8759f5a1d6a0..af0013d3df63 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -413,8 +413,6 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *val; - io_ctl_map_page(io_ctl, 1); /* @@ -429,14 +427,13 @@ static void io_ctl_set_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - val = io_ctl->cur; - *val = cpu_to_le64(generation); + put_unaligned_le64(generation, io_ctl->cur); io_ctl->cur += sizeof(u64); } static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) { - __le64 *gen; + u64 cache_gen; /* * Skip the crc area. If we don't check crcs then we just have a 64bit @@ -451,11 +448,11 @@ static int io_ctl_check_generation(struct btrfs_io_ctl *io_ctl, u64 generation) io_ctl->size -= sizeof(u64) * 2; } - gen = io_ctl->cur; - if (le64_to_cpu(*gen) != generation) { + cache_gen = get_unaligned_le64(io_ctl->cur); + if (cache_gen != generation) { btrfs_err_rl(io_ctl->fs_info, "space cache generation (%llu) does not match inode (%llu)", - *gen, generation); + cache_gen, generation); io_ctl_unmap_page(io_ctl); return -EIO; } @@ -525,8 +522,8 @@ static int io_ctl_add_entry(struct btrfs_io_ctl *io_ctl, u64 offset, u64 bytes, return -ENOSPC; entry = io_ctl->cur; - entry->offset = cpu_to_le64(offset); - entry->bytes = cpu_to_le64(bytes); + put_unaligned_le64(offset, &entry->offset); + put_unaligned_le64(bytes, &entry->bytes); entry->type = (bitmap) ? BTRFS_FREE_SPACE_BITMAP : BTRFS_FREE_SPACE_EXTENT; io_ctl->cur += sizeof(struct btrfs_free_space_entry); @@ -599,8 +596,8 @@ static int io_ctl_read_entry(struct btrfs_io_ctl *io_ctl, } e = io_ctl->cur; - entry->offset = le64_to_cpu(e->offset); - entry->bytes = le64_to_cpu(e->bytes); + entry->offset = get_unaligned_le64(&e->offset); + entry->bytes = get_unaligned_le64(&e->bytes); *type = e->type; io_ctl->cur += sizeof(struct btrfs_free_space_entry); io_ctl->size -= sizeof(struct btrfs_free_space_entry); From e97659cefe1e9dc0bb4284195ba4ab380d0cf1fe Mon Sep 17 00:00:00 2001 From: David Sterba Date: Tue, 15 Sep 2020 14:58:42 +0200 Subject: [PATCH 250/271] btrfs: use unaligned helpers for stack and header set/get helpers In the definitions generated by BTRFS_SETGET_HEADER_FUNCS there's direct pointer assignment but we should use the helpers for unaligned access for clarity. It hasn't been a problem so far because of the natural alignment. Similarly for BTRFS_SETGET_STACK_FUNCS, that usually get a structure from stack that has an aligned start but some members may not be aligned due to packing. This as well hasn't caused problems so far. Move the put/get_unaligned_le8 stubs to ctree.h so we can use them. Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 20 ++++++++++++++------ fs/btrfs/struct-funcs.c | 10 ---------- 2 files changed, 14 insertions(+), 16 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index cd644c755142..5cb8af6af25d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1420,6 +1420,16 @@ static inline void btrfs_init_map_token(struct btrfs_map_token *token, #define cpu_to_le8(v) (v) #define __le8 u8 +static inline u8 get_unaligned_le8(const void *p) +{ + return *(u8 *)p; +} + +static inline void put_unaligned_le8(u8 val, void *p) +{ + *(u8 *)p = val; +} + #define read_eb_member(eb, ptr, type, member, result) (\ read_extent_buffer(eb, (char *)(result), \ ((unsigned long)(ptr)) + \ @@ -1478,27 +1488,25 @@ static inline void btrfs_set_token_##name(struct btrfs_map_token *token,\ static inline u##bits btrfs_##name(const struct extent_buffer *eb) \ { \ const type *p = page_address(eb->pages[0]); \ - u##bits res = le##bits##_to_cpu(p->member); \ - return res; \ + return get_unaligned_le##bits(&p->member); \ } \ static inline void btrfs_set_##name(const struct extent_buffer *eb, \ u##bits val) \ { \ type *p = page_address(eb->pages[0]); \ - p->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &p->member); \ } #define BTRFS_SETGET_STACK_FUNCS(name, type, member, bits) \ static inline u##bits btrfs_##name(const type *s) \ { \ - return le##bits##_to_cpu(s->member); \ + return get_unaligned_le##bits(&s->member); \ } \ static inline void btrfs_set_##name(type *s, u##bits val) \ { \ - s->member = cpu_to_le##bits(val); \ + put_unaligned_le##bits(val, &s->member); \ } - static inline u64 btrfs_device_total_bytes(const struct extent_buffer *eb, struct btrfs_dev_item *s) { diff --git a/fs/btrfs/struct-funcs.c b/fs/btrfs/struct-funcs.c index 079b059818e9..c46be27be700 100644 --- a/fs/btrfs/struct-funcs.c +++ b/fs/btrfs/struct-funcs.c @@ -7,16 +7,6 @@ #include "ctree.h" -static inline u8 get_unaligned_le8(const void *p) -{ - return *(u8 *)p; -} - -static inline void put_unaligned_le8(u8 val, void *p) -{ - *(u8 *)p = val; -} - static bool check_setget_bounds(const struct extent_buffer *eb, const void *ptr, unsigned off, int size) { From 1465af12e254a68706e110846f59cf0f09683184 Mon Sep 17 00:00:00 2001 From: Qu Wenruo Date: Tue, 22 Sep 2020 10:37:01 +0800 Subject: [PATCH 251/271] btrfs: tree-checker: fix false alert caused by legacy btrfs root item Commit 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") introduced btrfs root item size check, however btrfs root item has two versions, the legacy one which just ends before generation_v2 member, is smaller than current btrfs root item size. This caused btrfs kernel to reject valid but old tree root leaves. Fix this problem by also allowing legacy root item, since kernel can already handle them pretty well and upgrade to newer root item format when needed. Reported-by: Martin Steigerwald Fixes: 259ee7754b67 ("btrfs: tree-checker: Add ROOT_ITEM check") CC: stable@vger.kernel.org # 5.4+ Tested-By: Martin Steigerwald Reviewed-by: Josef Bacik Signed-off-by: Qu Wenruo Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/tree-checker.c | 17 ++++++++++++----- include/uapi/linux/btrfs_tree.h | 14 ++++++++++++++ 2 files changed, 26 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 7b1fee630f97..f0ffd5ee77bd 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1035,7 +1035,7 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, int slot) { struct btrfs_fs_info *fs_info = leaf->fs_info; - struct btrfs_root_item ri; + struct btrfs_root_item ri = { 0 }; const u64 valid_root_flags = BTRFS_ROOT_SUBVOL_RDONLY | BTRFS_ROOT_SUBVOL_DEAD; int ret; @@ -1044,14 +1044,21 @@ static int check_root_item(struct extent_buffer *leaf, struct btrfs_key *key, if (ret < 0) return ret; - if (btrfs_item_size_nr(leaf, slot) != sizeof(ri)) { + if (btrfs_item_size_nr(leaf, slot) != sizeof(ri) && + btrfs_item_size_nr(leaf, slot) != btrfs_legacy_root_item_size()) { generic_err(leaf, slot, - "invalid root item size, have %u expect %zu", - btrfs_item_size_nr(leaf, slot), sizeof(ri)); + "invalid root item size, have %u expect %zu or %u", + btrfs_item_size_nr(leaf, slot), sizeof(ri), + btrfs_legacy_root_item_size()); } + /* + * For legacy root item, the members starting at generation_v2 will be + * all filled with 0. + * And since we allow geneartion_v2 as 0, it will still pass the check. + */ read_extent_buffer(leaf, &ri, btrfs_item_ptr_offset(leaf, slot), - sizeof(ri)); + btrfs_item_size_nr(leaf, slot)); /* Generation related */ if (btrfs_root_generation(&ri) > diff --git a/include/uapi/linux/btrfs_tree.h b/include/uapi/linux/btrfs_tree.h index 9ba64ca6b4ac..6b885982ece6 100644 --- a/include/uapi/linux/btrfs_tree.h +++ b/include/uapi/linux/btrfs_tree.h @@ -4,6 +4,11 @@ #include #include +#ifdef __KERNEL__ +#include +#else +#include +#endif /* * This header contains the structure definitions and constants used @@ -644,6 +649,15 @@ struct btrfs_root_item { __le64 reserved[8]; /* for future */ } __attribute__ ((__packed__)); +/* + * Btrfs root item used to be smaller than current size. The old format ends + * at where member generation_v2 is. + */ +static inline __u32 btrfs_legacy_root_item_size(void) +{ + return offsetof(struct btrfs_root_item, generation_v2); +} + /* * this is used for both forward and backward root refs */ From 98272bb77bf4cc20ed1ffca89832d713e70ebf09 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Sep 2020 14:13:29 +0100 Subject: [PATCH 252/271] btrfs: send, orphanize first all conflicting inodes when processing references When doing an incremental send it is possible that when processing the new references for an inode we end up issuing rename or link operations that have an invalid path, which contains the orphanized name of a directory before we actually orphanized it, causing the receiver to fail. The following reproducer triggers such scenario: $ cat reproducer.sh #!/bin/bash mkfs.btrfs -f /dev/sdi >/dev/null mount /dev/sdi /mnt/sdi touch /mnt/sdi/a touch /mnt/sdi/b mkdir /mnt/sdi/testdir # We want "a" to have a lower inode number then "testdir" (257 vs 259). mv /mnt/sdi/a /mnt/sdi/testdir/a # Filesystem looks like: # # . (ino 256) # |----- testdir/ (ino 259) # | |----- a (ino 257) # | # |----- b (ino 258) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1 btrfs send -f /tmp/snap1.send /mnt/sdi/snap1 # Now rename 259 to "testdir_2", then change the name of 257 to # "testdir" and make it a direct descendant of the root inode (256). # Also create a new link for inode 257 with the old name of inode 258. # By swapping the names and location of several inodes and create a # nasty dependency chain of rename and link operations. mv /mnt/sdi/testdir/a /mnt/sdi/a2 touch /mnt/sdi/testdir/a mv /mnt/sdi/b /mnt/sdi/b2 ln /mnt/sdi/a2 /mnt/sdi/b mv /mnt/sdi/testdir /mnt/sdi/testdir_2 mv /mnt/sdi/a2 /mnt/sdi/testdir # Filesystem now looks like: # # . (ino 256) # |----- testdir_2/ (ino 259) # | |----- a (ino 260) # | # |----- testdir (ino 257) # |----- b (ino 257) # |----- b2 (ino 258) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2 btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2 mkfs.btrfs -f /dev/sdj >/dev/null mount /dev/sdj /mnt/sdj btrfs receive -f /tmp/snap1.send /mnt/sdj btrfs receive -f /tmp/snap2.send /mnt/sdj umount /mnt/sdi umount /mnt/sdj When running the reproducer, the receive of the incremental send stream fails: $ ./reproducer.sh Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' At subvol /mnt/sdi/snap1 Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' At subvol /mnt/sdi/snap2 At subvol snap1 At snapshot snap2 ERROR: link b -> o259-6-0/a failed: No such file or directory The problem happens because of the following: 1) Before we start iterating the list of new references for inode 257, we generate its current path and store it at @valid_path, done at the very beginning of process_recorded_refs(). The generated path is "o259-6-0/a", containing the orphanized name for inode 259; 2) Then we iterate over the list of new references, which has the references "b" and "testdir" in that specific order; 3) We process reference "b" first, because it is in the list before reference "testdir". We then issue a link operation to create the new reference "b" using a target path corresponding to the content at @valid_path, which corresponds to "o259-6-0/a". However we haven't yet orphanized inode 259, its name is still "testdir", and not "o259-6-0". The orphanization of 259 did not happen yet because we will process the reference named "testdir" for inode 257 only in the next iteration of the loop that goes over the list of new references. Fix the issue by having a preliminar iteration over all the new references at process_recorded_refs(). This iteration is responsible only for doing the orphanization of other inodes that have and old reference that conflicts with one of the new references of the inode we are currently processing. The emission of rename and link operations happen now in the next iteration of the new references. A test case for fstests will follow soon. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 127 +++++++++++++++++++++++++++++++++--------------- 1 file changed, 87 insertions(+), 40 deletions(-) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 9f1ee52482c9..f9c14c33e753 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3873,52 +3873,56 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) goto out; } + /* + * Before doing any rename and link operations, do a first pass on the + * new references to orphanize any unprocessed inodes that may have a + * reference that conflicts with one of the new references of the current + * inode. This needs to happen first because a new reference may conflict + * with the old reference of a parent directory, so we must make sure + * that the path used for link and rename commands don't use an + * orphanized name when an ancestor was not yet orphanized. + * + * Example: + * + * Parent snapshot: + * + * . (ino 256) + * |----- testdir/ (ino 259) + * | |----- a (ino 257) + * | + * |----- b (ino 258) + * + * Send snapshot: + * + * . (ino 256) + * |----- testdir_2/ (ino 259) + * | |----- a (ino 260) + * | + * |----- testdir (ino 257) + * |----- b (ino 257) + * |----- b2 (ino 258) + * + * Processing the new reference for inode 257 with name "b" may happen + * before processing the new reference with name "testdir". If so, we + * must make sure that by the time we send a link command to create the + * hard link "b", inode 259 was already orphanized, since the generated + * path in "valid_path" already contains the orphanized name for 259. + * We are processing inode 257, so only later when processing 259 we do + * the rename operation to change its temporary (orphanized) name to + * "testdir_2". + */ list_for_each_entry(cur, &sctx->new_refs, list) { - /* - * We may have refs where the parent directory does not exist - * yet. This happens if the parent directories inum is higher - * than the current inum. To handle this case, we create the - * parent directory out of order. But we need to check if this - * did already happen before due to other refs in the same dir. - */ ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); if (ret < 0) goto out; - if (ret == inode_state_will_create) { - ret = 0; - /* - * First check if any of the current inodes refs did - * already create the dir. - */ - list_for_each_entry(cur2, &sctx->new_refs, list) { - if (cur == cur2) - break; - if (cur2->dir == cur->dir) { - ret = 1; - break; - } - } - - /* - * If that did not happen, check if a previous inode - * did already create the dir. - */ - if (!ret) - ret = did_create_dir(sctx, cur->dir); - if (ret < 0) - goto out; - if (!ret) { - ret = send_create_inode(sctx, cur->dir); - if (ret < 0) - goto out; - } - } + if (ret == inode_state_will_create) + continue; /* - * Check if this new ref would overwrite the first ref of - * another unprocessed inode. If yes, orphanize the - * overwritten inode. If we find an overwritten ref that is - * not the first ref, simply unlink it. + * Check if this new ref would overwrite the first ref of another + * unprocessed inode. If yes, orphanize the overwritten inode. + * If we find an overwritten ref that is not the first ref, + * simply unlink it. */ ret = will_overwrite_ref(sctx, cur->dir, cur->dir_gen, cur->name, cur->name_len, @@ -3997,6 +4001,49 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) } } + } + + list_for_each_entry(cur, &sctx->new_refs, list) { + /* + * We may have refs where the parent directory does not exist + * yet. This happens if the parent directories inum is higher + * than the current inum. To handle this case, we create the + * parent directory out of order. But we need to check if this + * did already happen before due to other refs in the same dir. + */ + ret = get_cur_inode_state(sctx, cur->dir, cur->dir_gen); + if (ret < 0) + goto out; + if (ret == inode_state_will_create) { + ret = 0; + /* + * First check if any of the current inodes refs did + * already create the dir. + */ + list_for_each_entry(cur2, &sctx->new_refs, list) { + if (cur == cur2) + break; + if (cur2->dir == cur->dir) { + ret = 1; + break; + } + } + + /* + * If that did not happen, check if a previous inode + * did already create the dir. + */ + if (!ret) + ret = did_create_dir(sctx, cur->dir); + if (ret < 0) + goto out; + if (!ret) { + ret = send_create_inode(sctx, cur->dir); + if (ret < 0) + goto out; + } + } + if (S_ISDIR(sctx->cur_inode_mode) && sctx->parent_root) { ret = wait_for_dest_dir_move(sctx, cur, is_orphan); if (ret < 0) From 9c2b4e0347067396ceb3ae929d6888c81d610259 Mon Sep 17 00:00:00 2001 From: Filipe Manana Date: Mon, 21 Sep 2020 14:13:30 +0100 Subject: [PATCH 253/271] btrfs: send, recompute reference path after orphanization of a directory During an incremental send, when an inode has multiple new references we might end up emitting rename operations for orphanizations that have a source path that is no longer valid due to a previous orphanization of some directory inode. This causes the receiver to fail since it tries to rename a path that does not exists. Example reproducer: $ cat reproducer.sh #!/bin/bash mkfs.btrfs -f /dev/sdi >/dev/null mount /dev/sdi /mnt/sdi touch /mnt/sdi/f1 touch /mnt/sdi/f2 mkdir /mnt/sdi/d1 mkdir /mnt/sdi/d1/d2 # Filesystem looks like: # # . (ino 256) # |----- f1 (ino 257) # |----- f2 (ino 258) # |----- d1/ (ino 259) # |----- d2/ (ino 260) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap1 btrfs send -f /tmp/snap1.send /mnt/sdi/snap1 # Now do a series of changes such that: # # *) inode 258 has one new hardlink and the previous name changed # # *) both names conflict with the old names of two other inodes: # # 1) the new name "d1" conflicts with the old name of inode 259, # under directory inode 256 (root) # # 2) the new name "d2" conflicts with the old name of inode 260 # under directory inode 259 # # *) inodes 259 and 260 now have the old names of inode 258 # # *) inode 257 is now located under inode 260 - an inode with a number # smaller than the inode (258) for which we created a second hard # link and swapped its names with inodes 259 and 260 # ln /mnt/sdi/f2 /mnt/sdi/d1/f2_link mv /mnt/sdi/f1 /mnt/sdi/d1/d2/f1 # Swap d1 and f2. mv /mnt/sdi/d1 /mnt/sdi/tmp mv /mnt/sdi/f2 /mnt/sdi/d1 mv /mnt/sdi/tmp /mnt/sdi/f2 # Swap d2 and f2_link mv /mnt/sdi/f2/d2 /mnt/sdi/tmp mv /mnt/sdi/f2/f2_link /mnt/sdi/f2/d2 mv /mnt/sdi/tmp /mnt/sdi/f2/f2_link # Filesystem now looks like: # # . (ino 256) # |----- d1 (ino 258) # |----- f2/ (ino 259) # |----- f2_link/ (ino 260) # | |----- f1 (ino 257) # | # |----- d2 (ino 258) btrfs subvolume snapshot -r /mnt/sdi /mnt/sdi/snap2 btrfs send -f /tmp/snap2.send -p /mnt/sdi/snap1 /mnt/sdi/snap2 mkfs.btrfs -f /dev/sdj >/dev/null mount /dev/sdj /mnt/sdj btrfs receive -f /tmp/snap1.send /mnt/sdj btrfs receive -f /tmp/snap2.send /mnt/sdj umount /mnt/sdi umount /mnt/sdj When executed the receive of the incremental stream fails: $ ./reproducer.sh Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap1' At subvol /mnt/sdi/snap1 Create a readonly snapshot of '/mnt/sdi' in '/mnt/sdi/snap2' At subvol /mnt/sdi/snap2 At subvol snap1 At snapshot snap2 ERROR: rename d1/d2 -> o260-6-0 failed: No such file or directory This happens because: 1) When processing inode 257 we end up computing the name for inode 259 because it is an ancestor in the send snapshot, and at that point it still has its old name, "d1", from the parent snapshot because inode 259 was not yet processed. We then cache that name, which is valid until we start processing inode 259 (or set the progress to 260 after processing its references); 2) Later we start processing inode 258 and collecting all its new references into the list sctx->new_refs. The first reference in the list happens to be the reference for name "d1" while the reference for name "d2" is next (the last element of the list). We compute the full path "d1/d2" for this second reference and store it in the reference (its ->full_path member). The path used for the new parent directory was "d1" and not "f2" because inode 259, the new parent, was not yet processed; 3) When we start processing the new references at process_recorded_refs() we start with the first reference in the list, for the new name "d1". Because there is a conflicting inode that was not yet processed, which is directory inode 259, we orphanize it, renaming it from "d1" to "o259-6-0"; 4) Then we start processing the new reference for name "d2", and we realize it conflicts with the reference of inode 260 in the parent snapshot. So we issue an orphanization operation for inode 260 by emitting a rename operation with a destination path of "o260-6-0" and a source path of "d1/d2" - this source path is the value we stored in the reference earlier at step 2), corresponding to the ->full_path member of the reference, however that path is no longer valid due to the orphanization of the directory inode 259 in step 3). This makes the receiver fail since the path does not exists, it should have been "o259-6-0/d2". Fix this by recomputing the full path of a reference before emitting an orphanization if we previously orphanized any directory, since that directory could be a parent in the new path. This is a rare scenario so keeping it simple and not checking if that previously orphanized directory is in fact an ancestor of the inode we are trying to orphanize. A test case for fstests follows soon. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Josef Bacik Signed-off-by: Filipe Manana Signed-off-by: David Sterba --- fs/btrfs/send.c | 72 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 72 insertions(+) diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f9c14c33e753..340c76a12ce1 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -3805,6 +3805,72 @@ static int update_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) return 0; } +/* + * When processing the new references for an inode we may orphanize an existing + * directory inode because its old name conflicts with one of the new references + * of the current inode. Later, when processing another new reference of our + * inode, we might need to orphanize another inode, but the path we have in the + * reference reflects the pre-orphanization name of the directory we previously + * orphanized. For example: + * + * parent snapshot looks like: + * + * . (ino 256) + * |----- f1 (ino 257) + * |----- f2 (ino 258) + * |----- d1/ (ino 259) + * |----- d2/ (ino 260) + * + * send snapshot looks like: + * + * . (ino 256) + * |----- d1 (ino 258) + * |----- f2/ (ino 259) + * |----- f2_link/ (ino 260) + * | |----- f1 (ino 257) + * | + * |----- d2 (ino 258) + * + * When processing inode 257 we compute the name for inode 259 as "d1", and we + * cache it in the name cache. Later when we start processing inode 258, when + * collecting all its new references we set a full path of "d1/d2" for its new + * reference with name "d2". When we start processing the new references we + * start by processing the new reference with name "d1", and this results in + * orphanizing inode 259, since its old reference causes a conflict. Then we + * move on the next new reference, with name "d2", and we find out we must + * orphanize inode 260, as its old reference conflicts with ours - but for the + * orphanization we use a source path corresponding to the path we stored in the + * new reference, which is "d1/d2" and not "o259-6-0/d2" - this makes the + * receiver fail since the path component "d1/" no longer exists, it was renamed + * to "o259-6-0/" when processing the previous new reference. So in this case we + * must recompute the path in the new reference and use it for the new + * orphanization operation. + */ +static int refresh_ref_path(struct send_ctx *sctx, struct recorded_ref *ref) +{ + char *name; + int ret; + + name = kmemdup(ref->name, ref->name_len, GFP_KERNEL); + if (!name) + return -ENOMEM; + + fs_path_reset(ref->full_path); + ret = get_cur_path(sctx, ref->dir, ref->dir_gen, ref->full_path); + if (ret < 0) + goto out; + + ret = fs_path_add(ref->full_path, name, ref->name_len); + if (ret < 0) + goto out; + + /* Update the reference's base name pointer. */ + set_ref_path(ref, ref->full_path); +out: + kfree(name); + return ret; +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -3939,6 +4005,12 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) struct name_cache_entry *nce; struct waiting_dir_move *wdm; + if (orphanized_dir) { + ret = refresh_ref_path(sctx, cur); + if (ret < 0) + goto out; + } + ret = orphanize_inode(sctx, ow_inode, ow_gen, cur->full_path); if (ret < 0) From 9a446d6a9fc7df48e53c54b161dfdfc6f5031d4b Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:33 +0300 Subject: [PATCH 254/271] btrfs: replace readpage_end_io_hook with direct calls Don't call readpage_end_io_hook for the btree inode. Instead of relying on indirect calls to implement metadata buffer validation simply check if the inode whose page we are processing equals the btree inode. If it does call the necessary function. This is an improvement in 2 directions: 1. We aren't paying the penalty of indirect calls in a post-speculation attacks world. 2. The function is now named more explicitly so it's obvious what's going on This is in preparation to removing struct extent_io_ops altogether. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/disk-io.c | 8 ++++---- fs/btrfs/disk-io.h | 4 +++- fs/btrfs/extent_io.c | 9 ++++++--- fs/btrfs/inode.c | 7 +++---- 5 files changed, 18 insertions(+), 12 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 5cb8af6af25d..391c2b25cca1 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2969,6 +2969,8 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index ebc110231f30..1c84a7ea9894 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -524,9 +524,9 @@ static int check_tree_block_fsid(struct extent_buffer *eb) return 1; } -static int btree_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror) { u64 found_start; int found_level; @@ -4638,5 +4638,5 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = btree_readpage_end_io_hook, + .readpage_end_io_hook = NULL }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 89b6a709a184..bc2e49246199 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -76,7 +76,9 @@ void btrfs_btree_balance_dirty(struct btrfs_fs_info *fs_info); void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info); void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, struct btrfs_root *root); - +int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, + int mirror); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index afac70ef0cc5..6cee6b611d0c 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2851,9 +2851,12 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - ret = tree->ops->readpage_end_io_hook(io_bio, offset, - page, start, end, - mirror); + if (data_inode) + ret = btrfs_verify_data_csum(io_bio, offset, page, + start, end, mirror); + else + ret = btrfs_validate_metadata_buffer(io_bio, + offset, page, start, end, mirror); if (ret) uptodate = 0; else diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b8481632dfc7..982187b0cd59 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2832,9 +2832,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, * if there's a match, we allow the bio to finish. If not, the code in * extent_io.c will try to find good copies for us. */ -static int btrfs_readpage_end_io_hook(struct btrfs_io_bio *io_bio, - u64 phy_offset, struct page *page, - u64 start, u64 end, int mirror) +int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, + struct page *page, u64 start, u64 end, int mirror) { size_t offset = start - page_offset(page); struct inode *inode = page->mapping->host; @@ -10261,7 +10260,7 @@ static const struct file_operations btrfs_dir_file_operations = { static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = btrfs_readpage_end_io_hook, + .readpage_end_io_hook = NULL }; /* From 1f03d9cfda53d86e7822bf0dd5f0ef5d8fa6d1ad Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:34 +0300 Subject: [PATCH 255/271] btrfs: remove extent_io_ops::readpage_end_io_hook It's no longer used so let's remove it. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 1 - fs/btrfs/extent_io.h | 5 +---- fs/btrfs/inode.c | 1 - 3 files changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 1c84a7ea9894..4c1f63924498 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -4638,5 +4638,4 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) static const struct extent_io_ops btree_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btree_submit_bio_hook, - .readpage_end_io_hook = NULL }; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 3bbc25b816ea..68c40902e571 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -76,13 +76,10 @@ typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct extent_io_ops { /* - * The following callbacks must be always defined, the function + * The following callback must be always defined, the function * pointer will be called unconditionally. */ submit_bio_hook_t *submit_bio_hook; - int (*readpage_end_io_hook)(struct btrfs_io_bio *io_bio, u64 phy_offset, - struct page *page, u64 start, u64 end, - int mirror); }; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 982187b0cd59..204cd9d39894 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -10260,7 +10260,6 @@ static const struct file_operations btrfs_dir_file_operations = { static const struct extent_io_ops btrfs_extent_io_ops = { /* mandatory callbacks */ .submit_bio_hook = btrfs_submit_bio_hook, - .readpage_end_io_hook = NULL }; /* From cd0537449c275ca7da6f6e3ea2389ae5225c6b4e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:35 +0300 Subject: [PATCH 256/271] btrfs: call submit_bio_hook directly in submit_one_bio BTRFS has 2 inode types (for the purposes of the code in submit_one_bio) - ordinary data inodes (including the freespace inode) and the btree inode. Both of these implement submit_bio_hook so btrfsic_submit_bio can never be called from submit_one_bio so just remove it. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 6cee6b611d0c..1a96251ade1f 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -168,11 +168,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; - if (tree->ops) - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); - else - btrfsic_submit_bio(bio); + ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, + bio_flags); return blk_status_to_errno(ret); } From be17b3afc4a6255008d4bbf7b1b45a36ac54bece Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:36 +0300 Subject: [PATCH 257/271] btrfs: don't opencode is_data_inode in end_bio_extent_readpage Use the is_data_inode helper. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/extent_io.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 1a96251ade1f..eb6094c87a55 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2816,8 +2816,6 @@ static void end_bio_extent_readpage(struct bio *bio) struct page *page = bvec->bv_page; struct inode *inode = page->mapping->host; struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); - bool data_inode = btrfs_ino(BTRFS_I(inode)) - != BTRFS_BTREE_INODE_OBJECTID; btrfs_debug(fs_info, "end_bio_extent_readpage: bi_sector=%llu, err=%d, mirror=%u", @@ -2848,7 +2846,7 @@ static void end_bio_extent_readpage(struct bio *bio) mirror = io_bio->mirror_num; if (likely(uptodate)) { - if (data_inode) + if (is_data_inode(inode)) ret = btrfs_verify_data_csum(io_bio, offset, page, start, end, mirror); else @@ -2866,7 +2864,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) goto readpage_ok; - if (data_inode) { + if (is_data_inode(inode)) { /* * The generic bio_readpage_error handles errors the From 908930f3edad79f7e5e35b7fac09da8c54ca96b5 Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:37 +0300 Subject: [PATCH 258/271] btrfs: stop calling submit_bio_hook for data inodes Instead export and rename the function to btrfs_submit_data_bio and call it directly in submit_one_bio. This avoids paying the cost for speculative attacks mitigations and improves code readability. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 ++ fs/btrfs/extent_io.c | 10 +++++++--- fs/btrfs/inode.c | 8 +++----- 3 files changed, 12 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 391c2b25cca1..75fe6b34e6a2 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -2969,6 +2969,8 @@ void btrfs_inode_safe_disk_i_size_write(struct inode *inode, u64 new_i_size); u64 btrfs_file_extent_end(const struct btrfs_path *path); /* inode.c */ +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index eb6094c87a55..bf1cbc171bec 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -168,8 +168,12 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, bio->bi_private = NULL; - ret = tree->ops->submit_bio_hook(tree->private_data, bio, mirror_num, - bio_flags); + if (is_data_inode(tree->private_data)) + ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, + bio_flags); + else + ret = tree->ops->submit_bio_hook(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } @@ -2879,7 +2883,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (!btrfs_submit_read_repair(inode, bio, offset, page, start - page_offset(page), start, end, mirror, - tree->ops->submit_bio_hook)) { + btrfs_submit_data_bio)) { uptodate = !bio->bi_status; offset += len; continue; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 204cd9d39894..dddff27774a1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -2184,9 +2184,8 @@ static blk_status_t btrfs_submit_bio_start(void *private_data, struct bio *bio, * * c-3) otherwise: async submit */ -static blk_status_t btrfs_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); @@ -10258,8 +10257,7 @@ static const struct file_operations btrfs_dir_file_operations = { }; static const struct extent_io_ops btrfs_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btrfs_submit_bio_hook, + .submit_bio_hook = NULL }; /* From 1b36294a6cd50ee5dcfee71ba5241cbb60f0295e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:38 +0300 Subject: [PATCH 259/271] btrfs: call submit_bio_hook directly for metadata pages No need to go through a function pointer indirection simply call submit_bio_hook directly by exporting and renaming the helper to btrfs_submit_metadata_bio. This makes the code more readable and should result in somewhat faster code due to no longer paying the price for specualtive attack mitigations that come with indirect function calls. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 +++----- fs/btrfs/disk-io.h | 2 ++ fs/btrfs/extent_io.c | 4 ++-- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 4c1f63924498..d07ec69eb525 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -814,9 +814,8 @@ static int check_async_write(struct btrfs_fs_info *fs_info, return 1; } -static blk_status_t btree_submit_bio_hook(struct inode *inode, struct bio *bio, - int mirror_num, - unsigned long bio_flags) +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); int async = check_async_write(fs_info, BTRFS_I(inode)); @@ -4636,6 +4635,5 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) } static const struct extent_io_ops btree_extent_io_ops = { - /* mandatory callbacks */ - .submit_bio_hook = btree_submit_bio_hook, + .submit_bio_hook = NULL }; diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index bc2e49246199..fee69ced58b4 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -79,6 +79,8 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info, int btrfs_validate_metadata_buffer(struct btrfs_io_bio *io_bio, u64 phy_offset, struct page *page, u64 start, u64 end, int mirror); +blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, + int mirror_num, unsigned long bio_flags); #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info); #endif diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index bf1cbc171bec..199dcc7d2db6 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -172,8 +172,8 @@ int __must_check submit_one_bio(struct bio *bio, int mirror_num, ret = btrfs_submit_data_bio(tree->private_data, bio, mirror_num, bio_flags); else - ret = tree->ops->submit_bio_hook(tree->private_data, bio, - mirror_num, bio_flags); + ret = btrfs_submit_metadata_bio(tree->private_data, bio, + mirror_num, bio_flags); return blk_status_to_errno(ret); } From 905eb88bceb253aa0f475eb04eec149d6059126e Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Fri, 18 Sep 2020 16:34:39 +0300 Subject: [PATCH 260/271] btrfs: remove struct extent_io_ops It's no longer used just remove the function and any related code which was initialising it for inodes. No functional changes. Removing 8 bytes from extent_io_tree in turn reduces size of other structures where it is embedded, notably btrfs_inode where it reduces size by 24 bytes. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.h | 2 -- fs/btrfs/disk-io.c | 7 ------- fs/btrfs/extent-io-tree.h | 1 - fs/btrfs/extent_io.c | 2 -- fs/btrfs/extent_io.h | 9 --------- fs/btrfs/inode.c | 16 ---------------- fs/btrfs/tests/inode-tests.c | 1 - 7 files changed, 38 deletions(-) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 75fe6b34e6a2..aac3d6f4e35b 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3583,9 +3583,7 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode); void btrfs_test_destroy_inode(struct inode *inode); - static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info) { return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state); diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index d07ec69eb525..3d39f5d47ad3 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -50,7 +50,6 @@ BTRFS_SUPER_FLAG_METADUMP |\ BTRFS_SUPER_FLAG_METADUMP_V2) -static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void btrfs_destroy_ordered_extents(struct btrfs_root *root); static int btrfs_destroy_delayed_refs(struct btrfs_transaction *trans, @@ -2065,8 +2064,6 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info) BTRFS_I(inode)->io_tree.track_uptodate = false; extent_map_tree_init(&BTRFS_I(inode)->extent_tree); - BTRFS_I(inode)->io_tree.ops = &btree_extent_io_ops; - BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); memset(&BTRFS_I(inode)->location, 0, sizeof(struct btrfs_key)); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -4633,7 +4630,3 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) return 0; } - -static const struct extent_io_ops btree_extent_io_ops = { - .submit_bio_hook = NULL -}; diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 53e7b5b5d6ad..9800a8306368 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -63,7 +63,6 @@ struct extent_io_tree { u8 owner; spinlock_t lock; - const struct extent_io_ops *ops; }; struct extent_state { diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 199dcc7d2db6..60f5f68d892d 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -281,7 +281,6 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, { tree->fs_info = fs_info; tree->state = RB_ROOT; - tree->ops = NULL; tree->dirty_bytes = 0; spin_lock_init(&tree->lock); tree->private_data = private_data; @@ -3055,7 +3054,6 @@ static int submit_extent_page(unsigned int opf, else contig = bio_end_sector(bio) == sector; - ASSERT(tree->ops); if (btrfs_bio_fits_in_stripe(page, page_size, bio, bio_flags)) can_merge = false; diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 68c40902e571..f39d02e7f7ef 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -74,15 +74,6 @@ typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio, typedef blk_status_t (extent_submit_bio_start_t)(void *private_data, struct bio *bio, u64 bio_offset); -struct extent_io_ops { - /* - * The following callback must be always defined, the function - * pointer will be called unconditionally. - */ - submit_bio_hook_t *submit_bio_hook; -}; - - #define INLINE_EXTENT_BUFFER_PAGES 16 #define MAX_INLINE_EXTENT_BUFFER_SIZE (INLINE_EXTENT_BUFFER_PAGES * PAGE_SIZE) struct extent_buffer { diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index dddff27774a1..d526833b5ec0 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -71,7 +71,6 @@ static const struct inode_operations btrfs_special_inode_operations; static const struct inode_operations btrfs_file_inode_operations; static const struct address_space_operations btrfs_aops; static const struct file_operations btrfs_dir_file_operations; -static const struct extent_io_ops btrfs_extent_io_ops; static struct kmem_cache *btrfs_inode_cachep; struct kmem_cache *btrfs_trans_handle_cachep; @@ -141,13 +140,6 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode, static int btrfs_dirty_inode(struct inode *inode); -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS -void btrfs_test_inode_set_ops(struct inode *inode) -{ - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; -} -#endif - static int btrfs_init_inode_security(struct btrfs_trans_handle *trans, struct inode *inode, struct inode *dir, const struct qstr *qstr) @@ -3391,7 +3383,6 @@ static int btrfs_read_locked_inode(struct inode *inode, switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; break; @@ -6307,7 +6298,6 @@ static int btrfs_create(struct inode *dir, struct dentry *dentry, if (err) goto out_unlock; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; d_instantiate_new(dentry, inode); out_unlock: @@ -9518,7 +9508,6 @@ static int btrfs_symlink(struct inode *dir, struct dentry *dentry, inode->i_fop = &btrfs_file_operations; inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; err = btrfs_init_inode_security(trans, inode, dir, &dentry->d_name); if (err) @@ -9838,7 +9827,6 @@ static int btrfs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode) inode->i_op = &btrfs_file_inode_operations; inode->i_mapping->a_ops = &btrfs_aops; - BTRFS_I(inode)->io_tree.ops = &btrfs_extent_io_ops; ret = btrfs_init_inode_security(trans, inode, dir, NULL); if (ret) @@ -10256,10 +10244,6 @@ static const struct file_operations btrfs_dir_file_operations = { .fsync = btrfs_sync_file, }; -static const struct extent_io_ops btrfs_extent_io_ops = { - .submit_bio_hook = NULL -}; - /* * btrfs doesn't support the bmap operation because swapfiles * use bmap to make a mapping of extents in the file. They assume diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index cc54d4973a74..e6719f7db386 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -949,7 +949,6 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } BTRFS_I(inode)->root = root; - btrfs_test_inode_set_ops(inode); /* [BTRFS_MAX_EXTENT_SIZE] */ ret = btrfs_set_extent_delalloc(BTRFS_I(inode), 0, From 124604eb50f88e6c1ff6587bdd2fc09bea810b32 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Sep 2020 16:44:32 -0400 Subject: [PATCH 261/271] btrfs: init device stats for seed devices We recently started recording device stats across the fleet, and noticed a large increase in messages such as this BTRFS warning (device dm-0): get dev_stats failed, not yet valid on our tiers that use seed devices for their root devices. This is because we do not initialize the device stats for any seed devices if we have a sprout device and mount using that sprout device. The basic steps for reproducing are: $ mkfs seed device $ mount seed device # fill seed device $ umount seed device $ btrfstune -S 1 seed device $ mount seed device $ btrfs device add -f sprout device /mnt/wherever $ umount /mnt/wherever $ mount sprout device /mnt/wherever $ btrfs device stats /mnt/wherever This will fail with the above message in dmesg. Fix this by iterating over the fs_devices->seed if they exist in btrfs_init_dev_stats. This fixed the problem and properly reports the stats for both devices. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Reviewed-by: David Sterba [ rename to btrfs_device_init_dev_stats ] Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 87 ++++++++++++++++++++++++---------------------- 1 file changed, 46 insertions(+), 41 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 6c7c8819cb31..d25650600179 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7223,61 +7223,66 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } +static void btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) +{ + struct btrfs_dev_stats_item *ptr; + struct extent_buffer *eb; + struct btrfs_key key; + int item_size; + int i, ret, slot; + + key.objectid = BTRFS_DEV_STATS_OBJECTID; + key.type = BTRFS_PERSISTENT_ITEM_KEY; + key.offset = device->devid; + ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0); + if (ret) { + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) + btrfs_dev_stat_set(device, i, 0); + device->dev_stats_valid = 1; + btrfs_release_path(path); + return; + } + slot = path->slots[0]; + eb = path->nodes[0]; + item_size = btrfs_item_size_nr(eb, slot); + + ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item); + + for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { + if (item_size >= (1 + i) * sizeof(__le64)) + btrfs_dev_stat_set(device, i, + btrfs_dev_stats_value(eb, ptr, i)); + else + btrfs_dev_stat_set(device, i, 0); + } + + device->dev_stats_valid = 1; + btrfs_dev_stat_print_on_load(device); + btrfs_release_path(path); +} + int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) { - struct btrfs_key key; - struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_fs_devices *fs_devices = fs_info->fs_devices; - struct extent_buffer *eb; - int slot; - int ret = 0; + struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; - int i; path = btrfs_alloc_path(); if (!path) return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) { - int item_size; - struct btrfs_dev_stats_item *ptr; - - key.objectid = BTRFS_DEV_STATS_OBJECTID; - key.type = BTRFS_PERSISTENT_ITEM_KEY; - key.offset = device->devid; - ret = btrfs_search_slot(NULL, dev_root, &key, path, 0, 0); - if (ret) { - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) - btrfs_dev_stat_set(device, i, 0); - device->dev_stats_valid = 1; - btrfs_release_path(path); - continue; - } - slot = path->slots[0]; - eb = path->nodes[0]; - item_size = btrfs_item_size_nr(eb, slot); - - ptr = btrfs_item_ptr(eb, slot, - struct btrfs_dev_stats_item); - - for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) { - if (item_size >= (1 + i) * sizeof(__le64)) - btrfs_dev_stat_set(device, i, - btrfs_dev_stats_value(eb, ptr, i)); - else - btrfs_dev_stat_set(device, i, 0); - } - - device->dev_stats_valid = 1; - btrfs_dev_stat_print_on_load(device); - btrfs_release_path(path); + list_for_each_entry(device, &fs_devices->devices, dev_list) + btrfs_device_init_dev_stats(device, path); + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) + btrfs_device_init_dev_stats(device, path); } mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return ret < 0 ? ret : 0; + return 0; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, From 92e26df43b1a9725c205a7e4416240ce72eac4a3 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Fri, 18 Sep 2020 16:44:33 -0400 Subject: [PATCH 262/271] btrfs: return error if we're unable to read device stats I noticed when fixing device stats for seed devices that we simply threw away the return value from btrfs_search_slot(). This is because we may not have stat items, but we could very well get an error, and thus miss reporting the error up the chain. Fix this by returning ret if it's an actual error, and then stop trying to init the rest of the devices stats and return the error up the chain. Reviewed-by: Anand Jain Signed-off-by: Josef Bacik Signed-off-by: David Sterba --- fs/btrfs/volumes.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index d25650600179..46f4efd58652 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -7223,8 +7223,8 @@ static void btrfs_set_dev_stats_value(struct extent_buffer *eb, sizeof(val)); } -static void btrfs_device_init_dev_stats(struct btrfs_device *device, - struct btrfs_path *path) +static int btrfs_device_init_dev_stats(struct btrfs_device *device, + struct btrfs_path *path) { struct btrfs_dev_stats_item *ptr; struct extent_buffer *eb; @@ -7241,7 +7241,7 @@ static void btrfs_device_init_dev_stats(struct btrfs_device *device, btrfs_dev_stat_set(device, i, 0); device->dev_stats_valid = 1; btrfs_release_path(path); - return; + return ret < 0 ? ret : 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -7260,6 +7260,8 @@ static void btrfs_device_init_dev_stats(struct btrfs_device *device, device->dev_stats_valid = 1; btrfs_dev_stat_print_on_load(device); btrfs_release_path(path); + + return 0; } int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) @@ -7267,22 +7269,30 @@ int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info) struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs; struct btrfs_device *device; struct btrfs_path *path = NULL; + int ret = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; mutex_lock(&fs_devices->device_list_mutex); - list_for_each_entry(device, &fs_devices->devices, dev_list) - btrfs_device_init_dev_stats(device, path); - list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { - list_for_each_entry(device, &seed_devs->devices, dev_list) - btrfs_device_init_dev_stats(device, path); + list_for_each_entry(device, &fs_devices->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; } + list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) { + list_for_each_entry(device, &seed_devs->devices, dev_list) { + ret = btrfs_device_init_dev_stats(device, path); + if (ret) + goto out; + } + } +out: mutex_unlock(&fs_devices->device_list_mutex); btrfs_free_path(path); - return 0; + return ret; } static int update_dev_stat_item(struct btrfs_trans_handle *trans, From c33fe275b530f1ce1ab99923d50eb399f53ed545 Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:08 -0500 Subject: [PATCH 263/271] fs: remove no longer used dio_end_io() Since we removed the last user of dio_end_io() when btrfs got converted to iomap infrastructure ("btrfs: switch to iomap for direct IO"), remove the helper function dio_end_io(). Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Christoph Hellwig Reviewed-by: Josef Bacik Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/direct-io.c | 19 ------------------- include/linux/fs.h | 2 -- 2 files changed, 21 deletions(-) diff --git a/fs/direct-io.c b/fs/direct-io.c index 183299892465..abf535b036ab 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -386,25 +386,6 @@ static void dio_bio_end_io(struct bio *bio) spin_unlock_irqrestore(&dio->bio_lock, flags); } -/** - * dio_end_io - handle the end io action for the given bio - * @bio: The direct io bio thats being completed - * - * This is meant to be called by any filesystem that uses their own dio_submit_t - * so that the DIO specific endio actions are dealt with after the filesystem - * has done it's completion work. - */ -void dio_end_io(struct bio *bio) -{ - struct dio *dio = bio->bi_private; - - if (dio->is_async) - dio_bio_end_aio(bio); - else - dio_bio_end_io(bio); -} -EXPORT_SYMBOL_GPL(dio_end_io); - static inline void dio_bio_alloc(struct dio *dio, struct dio_submit *sdio, struct block_device *bdev, diff --git a/include/linux/fs.h b/include/linux/fs.h index 7519ae003a08..8e2842a9c0b3 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -3079,8 +3079,6 @@ enum { DIO_SKIP_HOLES = 0x02, }; -void dio_end_io(struct bio *bio); - ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, struct block_device *bdev, struct iov_iter *iter, get_block_t get_block, From e3c57805f8f2215a1586741009111b92930cc6ac Mon Sep 17 00:00:00 2001 From: Goldwyn Rodrigues Date: Thu, 24 Sep 2020 11:39:09 -0500 Subject: [PATCH 264/271] btrfs: remove BTRFS_INODE_READDIO_NEED_LOCK Since we now perform direct reads using i_rwsem, we can remove this inode flag used to co-ordinate unlocked reads. The truncate call takes i_rwsem. This means it is correctly synchronized with concurrent direct reads. Reviewed-by: Nikolay Borisov Reviewed-by: Johannes Thumshirn Reviewed-by: Josef Bacik Reviewed-by: Christoph Hellwig Signed-off-by: Goldwyn Rodrigues Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 18 ------------------ fs/btrfs/inode.c | 3 --- 2 files changed, 21 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 6fdb46d58299..738009a22320 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -33,7 +33,6 @@ enum { BTRFS_INODE_NEEDS_FULL_SYNC, BTRFS_INODE_COPY_EVERYTHING, BTRFS_INODE_IN_DELALLOC_LIST, - BTRFS_INODE_READDIO_NEED_LOCK, BTRFS_INODE_HAS_PROPS, BTRFS_INODE_SNAPSHOT_FLUSH, }; @@ -334,23 +333,6 @@ struct btrfs_dio_private { u8 csums[]; }; -/* - * Disable DIO read nolock optimization, so new dio readers will be forced - * to grab i_mutex. It is used to avoid the endless truncate due to - * nonlocked dio read. - */ -static inline void btrfs_inode_block_unlocked_dio(struct btrfs_inode *inode) -{ - set_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); - smp_mb(); -} - -static inline void btrfs_inode_resume_unlocked_dio(struct btrfs_inode *inode) -{ - smp_mb__before_atomic(); - clear_bit(BTRFS_INODE_READDIO_NEED_LOCK, &inode->runtime_flags); -} - /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index d526833b5ec0..36efed0a24de 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4844,10 +4844,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) truncate_setsize(inode, newsize); - /* Disable nonlocked read DIO to avoid the endless truncate */ - btrfs_inode_block_unlocked_dio(BTRFS_I(inode)); inode_dio_wait(inode); - btrfs_inode_resume_unlocked_dio(BTRFS_I(inode)); ret = btrfs_truncate(inode, newsize == oldsize); if (ret && inode->i_nlink) { From 572c83acdcdafeb04e70aa46be1fa539310be20c Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Tue, 29 Sep 2020 08:53:54 -0400 Subject: [PATCH 265/271] btrfs: cleanup cow block on error In fstest btrfs/064 a transaction abort in __btrfs_cow_block could lead to a system lockup. It gets stuck trying to write back inodes, and the write back thread was trying to lock an extent buffer: $ cat /proc/2143497/stack [<0>] __btrfs_tree_lock+0x108/0x250 [<0>] lock_extent_buffer_for_io+0x35e/0x3a0 [<0>] btree_write_cache_pages+0x15a/0x3b0 [<0>] do_writepages+0x28/0xb0 [<0>] __writeback_single_inode+0x54/0x5c0 [<0>] writeback_sb_inodes+0x1e8/0x510 [<0>] wb_writeback+0xcc/0x440 [<0>] wb_workfn+0xd7/0x650 [<0>] process_one_work+0x236/0x560 [<0>] worker_thread+0x55/0x3c0 [<0>] kthread+0x13a/0x150 [<0>] ret_from_fork+0x1f/0x30 This is because we got an error while COWing a block, specifically here if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { btrfs_abort_transaction(trans, ret); return ret; } } [16402.241552] BTRFS: Transaction aborted (error -2) [16402.242362] WARNING: CPU: 1 PID: 2563188 at fs/btrfs/ctree.c:1074 __btrfs_cow_block+0x376/0x540 [16402.249469] CPU: 1 PID: 2563188 Comm: fsstress Not tainted 5.9.0-rc6+ #8 [16402.249936] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.13.0-2.fc32 04/01/2014 [16402.250525] RIP: 0010:__btrfs_cow_block+0x376/0x540 [16402.252417] RSP: 0018:ffff9cca40e578b0 EFLAGS: 00010282 [16402.252787] RAX: 0000000000000025 RBX: 0000000000000002 RCX: ffff9132bbd19388 [16402.253278] RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9132bbd19380 [16402.254063] RBP: ffff9132b41a49c0 R08: 0000000000000000 R09: 0000000000000000 [16402.254887] R10: 0000000000000000 R11: ffff91324758b080 R12: ffff91326ef17ce0 [16402.255694] R13: ffff91325fc0f000 R14: ffff91326ef176b0 R15: ffff9132815e2000 [16402.256321] FS: 00007f542c6d7b80(0000) GS:ffff9132bbd00000(0000) knlGS:0000000000000000 [16402.256973] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 [16402.257374] CR2: 00007f127b83f250 CR3: 0000000133480002 CR4: 0000000000370ee0 [16402.257867] Call Trace: [16402.258072] btrfs_cow_block+0x109/0x230 [16402.258356] btrfs_search_slot+0x530/0x9d0 [16402.258655] btrfs_lookup_file_extent+0x37/0x40 [16402.259155] __btrfs_drop_extents+0x13c/0xd60 [16402.259628] ? btrfs_block_rsv_migrate+0x4f/0xb0 [16402.259949] btrfs_replace_file_extents+0x190/0x820 [16402.260873] btrfs_clone+0x9ae/0xc00 [16402.261139] btrfs_extent_same_range+0x66/0x90 [16402.261771] btrfs_remap_file_range+0x353/0x3b1 [16402.262333] vfs_dedupe_file_range_one.part.0+0xd5/0x140 [16402.262821] vfs_dedupe_file_range+0x189/0x220 [16402.263150] do_vfs_ioctl+0x552/0x700 [16402.263662] __x64_sys_ioctl+0x62/0xb0 [16402.264023] do_syscall_64+0x33/0x40 [16402.264364] entry_SYSCALL_64_after_hwframe+0x44/0xa9 [16402.264862] RIP: 0033:0x7f542c7d15cb [16402.266901] RSP: 002b:00007ffd35944ea8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 [16402.267627] RAX: ffffffffffffffda RBX: 00000000009d1968 RCX: 00007f542c7d15cb [16402.268298] RDX: 00000000009d2490 RSI: 00000000c0189436 RDI: 0000000000000003 [16402.268958] RBP: 00000000009d2520 R08: 0000000000000036 R09: 00000000009d2e64 [16402.269726] R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000002 [16402.270659] R13: 000000000001f000 R14: 00000000009d1970 R15: 00000000009d2e80 [16402.271498] irq event stamp: 0 [16402.271846] hardirqs last enabled at (0): [<0000000000000000>] 0x0 [16402.272497] hardirqs last disabled at (0): [] copy_process+0x6b9/0x1ba0 [16402.273343] softirqs last enabled at (0): [] copy_process+0x6b9/0x1ba0 [16402.273905] softirqs last disabled at (0): [<0000000000000000>] 0x0 [16402.274338] ---[ end trace 737874a5a41a8236 ]--- [16402.274669] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry [16402.276179] BTRFS info (device dm-9): forced readonly [16402.277046] BTRFS: error (device dm-9) in btrfs_replace_file_extents:2723: errno=-2 No such entry [16402.278744] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry [16402.279968] BTRFS: error (device dm-9) in __btrfs_cow_block:1074: errno=-2 No such entry [16402.280582] BTRFS info (device dm-9): balance: ended with status: -30 The problem here is that as soon as we allocate the new block it is locked and marked dirty in the btree inode. This means that we could attempt to writeback this block and need to lock the extent buffer. However we're not unlocking it here and thus we deadlock. Fix this by unlocking the cow block if we have any errors inside of __btrfs_cow_block, and also free it so we do not leak it. CC: stable@vger.kernel.org # 4.4+ Reviewed-by: Filipe Manana Signed-off-by: Josef Bacik Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/ctree.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index a165093739c4..113da62dc17f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -1064,6 +1064,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, ret = update_ref_for_cow(trans, root, buf, cow, &last_ref); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1071,6 +1073,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { ret = btrfs_reloc_cow_block(trans, root, buf, cow); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } @@ -1103,6 +1107,8 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, if (last_ref) { ret = tree_mod_log_free_eb(buf); if (ret) { + btrfs_tree_unlock(cow); + free_extent_buffer(cow); btrfs_abort_transaction(trans, ret); return ret; } From 96c2e067ed3e3e004580a643c76f58729206b829 Mon Sep 17 00:00:00 2001 From: Anand Jain Date: Wed, 30 Sep 2020 21:09:52 +0800 Subject: [PATCH 266/271] btrfs: skip devices without magic signature when mounting Many things can happen after the device is scanned and before the device is mounted. One such thing is losing the BTRFS_MAGIC on the device. If it happens we still won't free that device from the memory and cause the userland confusion. For example: As the BTRFS_IOC_DEV_INFO still carries the device path which does not have the BTRFS_MAGIC, 'btrfs fi show' still lists device which does not belong to the filesystem anymore: $ mkfs.btrfs -fq -draid1 -mraid1 /dev/sda /dev/sdb $ wipefs -a /dev/sdb # /dev/sdb does not contain magic signature $ mount -o degraded /dev/sda /btrfs $ btrfs fi show -m Label: none uuid: 470ec6fb-646b-4464-b3cb-df1b26c527bd Total devices 2 FS bytes used 128.00KiB devid 1 size 3.00GiB used 571.19MiB path /dev/sda devid 2 size 3.00GiB used 571.19MiB path /dev/sdb We need to distinguish the missing signature and invalid superblock, so add a specific error code ENODATA for that. This also fixes failure of fstest btrfs/198. CC: stable@vger.kernel.org # 4.19+ Reviewed-by: Josef Bacik Signed-off-by: Anand Jain Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/disk-io.c | 8 ++++++-- fs/btrfs/volumes.c | 18 ++++++++++++------ 2 files changed, 18 insertions(+), 8 deletions(-) diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 3d39f5d47ad3..764001609a15 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -3424,8 +3424,12 @@ struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, return ERR_CAST(page); super = page_address(page); - if (btrfs_super_bytenr(super) != bytenr || - btrfs_super_magic(super) != BTRFS_MAGIC) { + if (btrfs_super_magic(super) != BTRFS_MAGIC) { + btrfs_release_disk_super(super); + return ERR_PTR(-ENODATA); + } + + if (btrfs_super_bytenr(super) != bytenr) { btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 46f4efd58652..58b9c419a2b6 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1198,17 +1198,23 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, { struct btrfs_device *device; struct btrfs_device *latest_dev = NULL; + struct btrfs_device *tmp_device; flags |= FMODE_EXCL; - list_for_each_entry(device, &fs_devices->devices, dev_list) { - /* Just open everything we can; ignore failures here */ - if (btrfs_open_one_device(fs_devices, device, flags, holder)) - continue; + list_for_each_entry_safe(device, tmp_device, &fs_devices->devices, + dev_list) { + int ret; - if (!latest_dev || - device->generation > latest_dev->generation) + ret = btrfs_open_one_device(fs_devices, device, flags, holder); + if (ret == 0 && + (!latest_dev || device->generation > latest_dev->generation)) { latest_dev = device; + } else if (ret == -ENODATA) { + fs_devices->num_devices--; + list_del(&device->dev_list); + btrfs_free_device(device); + } } if (fs_devices->open_devices == 0) return -EINVAL; From 8d1a7aae89dc0c41ffb76fe1007dbba59d13881b Mon Sep 17 00:00:00 2001 From: Madhuparna Bhowmik Date: Thu, 5 Dec 2019 01:49:01 +0530 Subject: [PATCH 267/271] btrfs: annotate device name rcu_string with __rcu This patch fixes the following sparse errors in fs/btrfs/super.c in function btrfs_show_devname() fs/btrfs/super.c: error: incompatible types in comparison expression (different address spaces): fs/btrfs/super.c: struct rcu_string [noderef] * fs/btrfs/super.c: struct rcu_string * The error was because of the following line in function btrfs_show_devname(): if (first_dev) seq_escape(m, rcu_str_deref(first_dev->name), " \t\n\\"); Annotating the btrfs_device::name member with __rcu fixes the sparse error. Acked-by: Joel Fernandes (Google) Signed-off-by: Madhuparna Bhowmik Signed-off-by: David Sterba --- fs/btrfs/volumes.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 48bdca01e237..bf27ac07d315 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -58,7 +58,7 @@ struct btrfs_device { struct btrfs_fs_devices *fs_devices; struct btrfs_fs_info *fs_info; - struct rcu_string *name; + struct rcu_string __rcu *name; u64 generation; From 1fd4033dd011a3525bacddf37ab9eac425d25c4f Mon Sep 17 00:00:00 2001 From: Nikolay Borisov Date: Thu, 1 Oct 2020 09:40:39 +0300 Subject: [PATCH 268/271] btrfs: rename BTRFS_INODE_ORDERED_DATA_CLOSE flag Commit 8d875f95da43 ("btrfs: disable strict file flushes for renames and truncates") eliminated the notion of ordered operations and instead BTRFS_INODE_ORDERED_DATA_CLOSE only remained as a flag indicating that a file's content should be synced to disk in case a file is truncated and any writes happen to it concurrently. In fact this intendend behavior was broken until it was fixed in f6dc45c7a93a ("Btrfs: fix filemap_flush call in btrfs_file_release"). All things considered let's give the flag a more descriptive name. Also slightly reword comments. Signed-off-by: Nikolay Borisov Reviewed-by: David Sterba Signed-off-by: David Sterba --- fs/btrfs/btrfs_inode.h | 2 +- fs/btrfs/file.c | 10 +++++----- fs/btrfs/inode.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 738009a22320..92dd86bceae3 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -21,7 +21,7 @@ * new data the application may have written before commit. */ enum { - BTRFS_INODE_ORDERED_DATA_CLOSE, + BTRFS_INODE_FLUSH_ON_CLOSE, BTRFS_INODE_DUMMY, BTRFS_INODE_IN_DEFRAG, BTRFS_INODE_HAS_ASYNC_EXTENT, diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 038e0afaf3d0..0ff659455b1e 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2091,12 +2091,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp) filp->private_data = NULL; /* - * ordered_data_close is set by setattr when we are about to truncate - * a file from a non-zero size to a zero size. This tries to - * flush down new bytes that may have been written if the - * application were using truncate to replace a file in place. + * Set by setattr when we are about to truncate a file from a non-zero + * size to a zero size. This tries to flush down new bytes that may + * have been written if the application were using truncate to replace + * a file in place. */ - if (test_and_clear_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags)) filemap_flush(inode->i_mapping); return 0; diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 36efed0a24de..936c3137c646 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -4835,11 +4835,11 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) /* * We're truncating a file that used to have good data down to - * zero. Make sure it gets into the ordered flush list so that - * any new writes get down to disk quickly. + * zero. Make sure any new writes to the file get on disk + * on close. */ if (newsize == 0) - set_bit(BTRFS_INODE_ORDERED_DATA_CLOSE, + set_bit(BTRFS_INODE_FLUSH_ON_CLOSE, &BTRFS_I(inode)->runtime_flags); truncate_setsize(inode, newsize); From db1af1e9712920f47b5dc6a995fca3eec05ea85e Mon Sep 17 00:00:00 2001 From: Wolfram Sang Date: Thu, 8 Oct 2020 11:25:33 +0200 Subject: [PATCH 269/271] mmc: renesas_sdhi: workaround a regression when reinserting SD cards After the conversions of the reset routines, re-inserting SD cards didn't work anymore. Apply this temporary workaround to have working SD cards during the merge window. The issue will be fixed properly until the final release. Signed-off-by: Wolfram Sang Link: https://lore.kernel.org/r/20201008092533.76588-1-wsa+renesas@sang-engineering.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/renesas_sdhi_core.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/mmc/host/renesas_sdhi_core.c b/drivers/mmc/host/renesas_sdhi_core.c index 20e5eb63caf8..414314151d0a 100644 --- a/drivers/mmc/host/renesas_sdhi_core.c +++ b/drivers/mmc/host/renesas_sdhi_core.c @@ -572,6 +572,17 @@ static void renesas_sdhi_reset(struct tmio_mmc_host *host) TMIO_MASK_INIT_RCAR2); } +/* + * This is a temporary workaround! This driver used 'hw_reset' wrongly and the + * fix for that showed a regression. So, we mimic the old behaviour until the + * proper solution is found. + */ +static void renesas_sdhi_hw_reset(struct mmc_host *mmc) +{ + struct tmio_mmc_host *host = mmc_priv(mmc); + renesas_sdhi_reset(host); +} + #define SH_MOBILE_SDHI_MIN_TAP_ROW 3 static int renesas_sdhi_select_tuning(struct tmio_mmc_host *host) @@ -1009,6 +1020,8 @@ int renesas_sdhi_probe(struct platform_device *pdev, if (of_data && of_data->scc_offset) { priv->scc_ctl = host->ctl + of_data->scc_offset; host->reset = renesas_sdhi_reset; + host->ops.hw_reset = renesas_sdhi_hw_reset; + host->mmc->caps |= MMC_CAP_HW_RESET; } } From 915f4c9358db6f96f08934dd683ae297aaa0fb91 Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Mon, 5 Oct 2020 15:15:50 +0800 Subject: [PATCH 270/271] erofs: remove unnecessary enum entries Opt_nouser_xattr and Opt_noacl are useless, so just remove them. Signed-off-by: Chengguang Xu Reviewed-by: Gao Xiang Reviewed-by: Chao Yu Link: https://lore.kernel.org/r/20201005071550.66193-1-cgxu519@mykernel.net Signed-off-by: Gao Xiang --- fs/erofs/super.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/erofs/super.c b/fs/erofs/super.c index ddaa516c008a..b9a09806512a 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -211,9 +211,7 @@ static void erofs_default_options(struct erofs_fs_context *ctx) enum { Opt_user_xattr, - Opt_nouser_xattr, Opt_acl, - Opt_noacl, Opt_cache_strategy, Opt_err }; From 1e23400f1a7342a2805cc647e6314cd12bfb5526 Mon Sep 17 00:00:00 2001 From: Faiz Abbas Date: Thu, 8 Oct 2020 15:31:29 +0530 Subject: [PATCH 271/271] mmc: sdhci_am654: Fix module autoload Add a MODULE_DEVICE_TABLE() entry so that the driver is autoloaded when built as a module. Signed-off-by: Faiz Abbas Link: https://lore.kernel.org/r/20201008100129.13917-1-faiz_abbas@ti.com Signed-off-by: Ulf Hansson --- drivers/mmc/host/sdhci_am654.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/mmc/host/sdhci_am654.c b/drivers/mmc/host/sdhci_am654.c index 2bce962bf7e4..a64ea143d185 100644 --- a/drivers/mmc/host/sdhci_am654.c +++ b/drivers/mmc/host/sdhci_am654.c @@ -739,6 +739,7 @@ static const struct of_device_id sdhci_am654_of_match[] = { }, { /* sentinel */ } }; +MODULE_DEVICE_TABLE(of, sdhci_am654_of_match); static int sdhci_am654_probe(struct platform_device *pdev) {