From ba54d3ced958435e1802daf992cfd44c26cd4cb7 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:01:58 -0700 Subject: [PATCH 1/6] RDS: fix the dangling reference to rds_ib_incoming_slab On rds_ib_frag_slab allocation failure, ensure rds_ib_incoming_slab is not pointing to the detsroyed memory. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_recv.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c index ed9b41e3b277..6bbe62060060 100644 --- a/net/rds/ib_recv.c +++ b/net/rds/ib_recv.c @@ -1102,9 +1102,10 @@ int rds_ib_recv_init(void) rds_ib_frag_slab = kmem_cache_create("rds_ib_frag", sizeof(struct rds_page_frag), 0, SLAB_HWCACHE_ALIGN, NULL); - if (!rds_ib_frag_slab) + if (!rds_ib_frag_slab) { kmem_cache_destroy(rds_ib_incoming_slab); - else + rds_ib_incoming_slab = NULL; + } else ret = 0; out: return ret; From 3f6b3143031b678a8577df1f24ca977510aefcf5 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:01:59 -0700 Subject: [PATCH 2/6] RDS: Fix rds MR reference count in rds_rdma_unuse() rds_rdma_unuse() drops the mr reference count which it hasn't taken. Correct way of removing mr is to remove mr from the tree and then rdma_destroy_mr() it first, then rds_mr_put() to decrement its reference count. Whichever thread holds last reference will free the mr via rds_mr_put() This bug was triggering weird null pointer crashes. One if the trace for it is captured below. BUG: unable to handle kernel NULL pointer dereference at 0000000000000104 IP: [] rds_ib_free_mr+0x31/0x130 [rds_rdma] PGD 4366fa067 PUD 4366f9067 PMD 0 Oops: 0000 [#1] SMP [...] task: ffff88046da6a000 ti: ffff88046da6c000 task.ti: ffff88046da6c000 RIP: 0010:[] [] rds_ib_free_mr+0x31/0x130 [rds_rdma] RSP: 0018:ffff88046fa43bd8 EFLAGS: 00010286 RAX: 0000000071d38b80 RBX: 0000000000000000 RCX: 0000000000000000 RDX: 0000000000000001 RSI: 0000000000000000 RDI: ffff880079e7ff40 RBP: ffff88046fa43bf8 R08: 0000000000000000 R09: 0000000000000000 R10: ffff88046fa43ca8 R11: ffff88046a802ed8 R12: ffff880079e7fa40 R13: 0000000000000000 R14: ffff880079e7ff40 R15: 0000000000000000 FS: 0000000000000000(0000) GS:ffff88046fa40000(0000) knlGS:0000000000000000 CS: 0010 DS: 0000 ES: 0000 CR0: 000000008005003b CR2: 0000000000000104 CR3: 00000004366fb000 CR4: 00000000000006e0 Stack: ffff880079e7fa40 ffff880671d38f08 ffff880079e7ff40 0000000000000296 ffff88046fa43c28 ffffffffa087a38b ffff880079e7fa40 ffff880671d38f10 0000000000000000 0000000000000292 ffff88046fa43c48 ffffffffa087a3b6 Call Trace: [] rds_destroy_mr+0x8b/0xa0 [rds] [] __rds_put_mr_final+0x16/0x30 [rds] [] rds_rdma_unuse+0xc2/0x120 [rds] [] rds_recv_incoming_exthdrs+0x83/0xa0 [rds] [] rds_recv_incoming+0x92/0x200 [rds] [] rds_ib_process_recv+0x259/0x320 [rds_rdma] [] rds_ib_recv_tasklet_fn+0x1a8/0x490 [rds_rdma] [] ? __remove_hrtimer+0x58/0x90 [] tasklet_action+0xb1/0xc0 [] __do_softirq+0xe2/0x290 [] irq_exit+0xa6/0xb0 [] do_IRQ+0x65/0xf0 [] common_interrupt+0x6b/0x6b Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/rdma.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/net/rds/rdma.c b/net/rds/rdma.c index c1df9b1cf3b2..4c93badeabf2 100644 --- a/net/rds/rdma.c +++ b/net/rds/rdma.c @@ -435,9 +435,10 @@ void rds_rdma_unuse(struct rds_sock *rs, u32 r_key, int force) /* If the MR was marked as invalidate, this will * trigger an async flush. */ - if (zot_me) + if (zot_me) { rds_destroy_mr(mr); - rds_mr_put(mr); + rds_mr_put(mr); + } } void rds_rdma_free_op(struct rm_rdma_op *ro) From 6116c2030fff91950f68b7fffb5959c91a05aaf6 Mon Sep 17 00:00:00 2001 From: Wengang Wang Date: Tue, 25 Aug 2015 12:02:00 -0700 Subject: [PATCH 3/6] RDS: fix fmr pool dirty_count In rds_ib_flush_mr_pool(), dirty_count accounts the clean ones which is wrong. This can lead to a negative dirty count value. Lets fix it. Signed-off-by: Wengang Wang Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 15 +++++++++------ 1 file changed, 9 insertions(+), 6 deletions(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 7b7aac8cdb56..a275b7d205ef 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -528,11 +528,13 @@ static inline unsigned int rds_ib_flush_goal(struct rds_ib_mr_pool *pool, int fr /* * given an llist of mrs, put them all into the list_head for more processing */ -static void llist_append_to_list(struct llist_head *llist, struct list_head *list) +static unsigned int llist_append_to_list(struct llist_head *llist, + struct list_head *list) { struct rds_ib_mr *ibmr; struct llist_node *node; struct llist_node *next; + unsigned int count = 0; node = llist_del_all(llist); while (node) { @@ -540,7 +542,9 @@ static void llist_append_to_list(struct llist_head *llist, struct list_head *lis ibmr = llist_entry(node, struct rds_ib_mr, llnode); list_add_tail(&ibmr->unmap_list, list); node = next; + count++; } + return count; } /* @@ -581,7 +585,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, LIST_HEAD(unmap_list); LIST_HEAD(fmr_list); unsigned long unpinned = 0; - unsigned int nfreed = 0, ncleaned = 0, free_goal; + unsigned int nfreed = 0, dirty_to_clean = 0, free_goal; int ret = 0; rds_ib_stats_inc(s_ib_rdma_mr_pool_flush); @@ -623,8 +627,8 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, /* Get the list of all MRs to be dropped. Ordering matters - * we want to put drop_list ahead of free_list. */ - llist_append_to_list(&pool->drop_list, &unmap_list); - llist_append_to_list(&pool->free_list, &unmap_list); + dirty_to_clean = llist_append_to_list(&pool->drop_list, &unmap_list); + dirty_to_clean += llist_append_to_list(&pool->free_list, &unmap_list); if (free_all) llist_append_to_list(&pool->clean_list, &unmap_list); @@ -652,7 +656,6 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, kfree(ibmr); nfreed++; } - ncleaned++; } if (!list_empty(&unmap_list)) { @@ -678,7 +681,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, } atomic_sub(unpinned, &pool->free_pinned); - atomic_sub(ncleaned, &pool->dirty_count); + atomic_sub(dirty_to_clean, &pool->dirty_count); atomic_sub(nfreed, &pool->item_count); out: From ad1d7dc0d79d3dd2c5d2931b13edbd4fe33e5fac Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:02:01 -0700 Subject: [PATCH 4/6] RDS: push FMR pool flush work to its own worker RDS FMR flush operation and also it races with connect/reconect which happes a lot with RDS. FMR flush being on common rds_wq aggrevates the problem. Lets push RDS FMR pool flush work to its own worker. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib.c | 9 ++++++++- net/rds/ib.h | 2 ++ net/rds/ib_rdma.c | 27 ++++++++++++++++++++++++--- 3 files changed, 34 insertions(+), 4 deletions(-) diff --git a/net/rds/ib.c b/net/rds/ib.c index 13814227b3b2..d020fade312c 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -366,6 +366,7 @@ void rds_ib_exit(void) rds_ib_sysctl_exit(); rds_ib_recv_exit(); rds_trans_unregister(&rds_ib_transport); + rds_ib_fmr_exit(); } struct rds_transport rds_ib_transport = { @@ -401,10 +402,14 @@ int rds_ib_init(void) INIT_LIST_HEAD(&rds_ib_devices); - ret = ib_register_client(&rds_ib_client); + ret = rds_ib_fmr_init(); if (ret) goto out; + ret = ib_register_client(&rds_ib_client); + if (ret) + goto out_fmr_exit; + ret = rds_ib_sysctl_init(); if (ret) goto out_ibreg; @@ -427,6 +432,8 @@ int rds_ib_init(void) rds_ib_sysctl_exit(); out_ibreg: rds_ib_unregister_client(); +out_fmr_exit: + rds_ib_fmr_exit(); out: return ret; } diff --git a/net/rds/ib.h b/net/rds/ib.h index 6422c52682e5..9fc95e38659a 100644 --- a/net/rds/ib.h +++ b/net/rds/ib.h @@ -313,6 +313,8 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents, void rds_ib_sync_mr(void *trans_private, int dir); void rds_ib_free_mr(void *trans_private, int invalidate); void rds_ib_flush_mrs(void); +int rds_ib_fmr_init(void); +void rds_ib_fmr_exit(void); /* ib_recv.c */ int rds_ib_recv_init(void); diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index a275b7d205ef..2ac78c9879ea 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -83,6 +83,25 @@ struct rds_ib_mr_pool { struct ib_fmr_attr fmr_attr; }; +struct workqueue_struct *rds_ib_fmr_wq; + +int rds_ib_fmr_init(void) +{ + rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd"); + if (!rds_ib_fmr_wq) + return -ENOMEM; + return 0; +} + +/* By the time this is called all the IB devices should have been torn down and + * had their pools freed. As each pool is freed its work struct is waited on, + * so the pool flushing work queue should be idle by the time we get here. + */ +void rds_ib_fmr_exit(void) +{ + destroy_workqueue(rds_ib_fmr_wq); +} + static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **); static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr); static void rds_ib_mr_pool_flush_worker(struct work_struct *work); @@ -719,15 +738,17 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || atomic_read(&pool->dirty_count) >= pool->max_items / 10) - schedule_delayed_work(&pool->flush_worker, 10); + queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { if (likely(!in_interrupt())) { rds_ib_flush_mr_pool(pool, 0, NULL); } else { /* We get here if the user created a MR marked - * as use_once and invalidate at the same time. */ - schedule_delayed_work(&pool->flush_worker, 10); + * as use_once and invalidate at the same time. + */ + queue_delayed_work(rds_ib_fmr_wq, + &pool->flush_worker, 10); } } From ef5217a6e2e60bc3d0679f2652480b99730956fe Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:02:02 -0700 Subject: [PATCH 5/6] RDS: flush the FMR pool less often FMR flush is an expensive and time consuming operation. Reduce the frequency of FMR pool flush by 50% so that more FMR work gets accumulated for more efficient flushing. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index 2ac78c9879ea..e596dfb76038 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -737,7 +737,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate) /* If we've pinned too many pages, request a flush */ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned || - atomic_read(&pool->dirty_count) >= pool->max_items / 10) + atomic_read(&pool->dirty_count) >= pool->max_items / 5) queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10); if (invalidate) { From 272412141908c40517cc89d5bb2eb074a2ec1474 Mon Sep 17 00:00:00 2001 From: "santosh.shilimkar@oracle.com" Date: Tue, 25 Aug 2015 12:02:03 -0700 Subject: [PATCH 6/6] RDS: remove superfluous from rds_ib_alloc_fmr() Memory allocated for 'ibmr' uses kzalloc_node() which already initialises the memory to zero. There is no need to do memset() 0 on that memory. Signed-off-by: Santosh Shilimkar Signed-off-by: Santosh Shilimkar Signed-off-by: David S. Miller --- net/rds/ib_rdma.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c index e596dfb76038..251d1ce0b7c7 100644 --- a/net/rds/ib_rdma.c +++ b/net/rds/ib_rdma.c @@ -360,8 +360,6 @@ static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev) goto out_no_cigar; } - memset(ibmr, 0, sizeof(*ibmr)); - ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd, (IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ |