block-6.17-20250815

-----BEGIN PGP SIGNATURE-----
 
 iQJEBAABCAAuFiEEwPw5LcreJtl1+l5K99NY+ylx4KYFAmifSywQHGF4Ym9lQGtl
 cm5lbC5kawAKCRD301j7KXHgpo6lD/0T1HlfvcG2kVfk35pcn1EaVxX5li1nuRTa
 afUmnVHeOe8C5Fcl9uwc7oBcdJUGldWcE3ARBHorMhmougy9hVj5T9qUKzP6w87U
 ZVMU63gDJ13Uz0hv3nl6UDSv2P0Sgk+zb9qEpJx/XiOVtaw9rERGNpLNjjASU65v
 c+BdRO4dwIrWUEz/dWU8AMADacjyFd1fBLvLxUQAWJ2Jo9voup4KnywDw5AuoTSw
 wmXPYYoyaPtciNontQz+s6lFX4qCJJijnmiS4W7jF//xxbMS5MxSQzKVhdH5Ax0w
 hpMO2Zb7UsF3zPiNiwfrtukhK68GJOJPmmOFkVPjaLU15kme84mDKE7Ad6hmiJXy
 nDaMbt7+NLvV3Gn4TEZxyiqwA4MPOoqdmwg7rFPD4k6oCQ9Z+lOnKSd+gGNX2Mn5
 fSjTAGFRXygfJ7mshGjRSqPEzqa+uPGMg+32gEUcE7Dx3U0bvFs4jZSqgEAjEAqR
 e62zJtlYKgwaj+1lHG5+GMq1guU//H8IndSFGPMRKUBky1AIaLZZ8LiQbqEdnAWW
 nTuPPUth5l2AzdmVjiBRzlOoiH8vhf8Mh5n+QPHP/olG9AKUmpYWquMzDMHKnZtt
 6SOvnD8RDUgcojgdDPHXHpWEclNEw31b+jjZaFC+qcwnGDqAhEFy4qZJNFqqsiDl
 W2nSJrsLtw==
 =JNGm
 -----END PGP SIGNATURE-----

Merge tag 'block-6.17-20250815' of git://git.kernel.dk/linux

Pull block fixes from Jens Axboe:

 - Fix for unprivileged daemons in ublk

 - Speedup ublk release by removing unnecessary quiesce

 - Fix for blk-wbt, where a regression caused it to not be possible to
   enable at runtime

 - blk-wbt cleanups

 - Kill the page pool from drbd

 - Remove redundant __GFP_NOWARN uses in a few spots

 - Fix for a kobject double initialization issues

* tag 'block-6.17-20250815' of git://git.kernel.dk/linux:
  block: restore default wbt enablement
  Docs: admin-guide: Correct spelling mistake
  blk-wbt: doc: Update the doc of the wbt_lat_usec interface
  blk-wbt: Eliminate ambiguity in the comments of struct rq_wb
  blk-wbt: Optimize wbt_done() for non-throttled writes
  block: fix kobject double initialization in add_disk
  blk-cgroup: remove redundant __GFP_NOWARN
  block, bfq: remove redundant __GFP_NOWARN
  ublk: check for unprivileged daemon on each I/O fetch
  ublk: don't quiesce in ublk_ch_release
  drbd: Remove the open-coded page pool
This commit is contained in:
Linus Torvalds 2025-08-15 08:20:36 -07:00
commit ee94b00c1a
13 changed files with 105 additions and 384 deletions

View File

@ -731,7 +731,7 @@ Contact: linux-block@vger.kernel.org
Description:
[RW] If the device is registered for writeback throttling, then
this file shows the target minimum read latency. If this latency
is exceeded in a given window of time (see wb_window_usec), then
is exceeded in a given window of time (see curr_win_nsec), then
the writeback throttling will start scaling back writes. Writing
a value of '0' to this file disables the feature. Writing a
value of '-1' to this file resets the value to the default

View File

@ -79,7 +79,7 @@ zone_capacity_mb Device zone capacity (must always be equal to or lower than
the zone size. Default: zone size.
conv_zones Total number of conventioanl zones starting from sector 0.
Default: 8.
base_dir Path to the base directoy where to create the directory
base_dir Path to the base directory where to create the directory
containing the zone files of the device.
Default=/var/local/zloop.
The device directory containing the zone files is always

View File

@ -5847,8 +5847,7 @@ static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd,
goto out;
}
bfqq = kmem_cache_alloc_node(bfq_pool,
GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN,
bfqq = kmem_cache_alloc_node(bfq_pool, GFP_NOWAIT | __GFP_ZERO,
bfqd->queue->node);
if (bfqq) {

View File

@ -394,7 +394,7 @@ static struct blkcg_gq *blkg_create(struct blkcg *blkcg, struct gendisk *disk,
/* allocate */
if (!new_blkg) {
new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT | __GFP_NOWARN);
new_blkg = blkg_alloc(blkcg, disk, GFP_NOWAIT);
if (unlikely(!new_blkg)) {
ret = -ENOMEM;
goto err_put_css;
@ -1467,7 +1467,7 @@ blkcg_css_alloc(struct cgroup_subsys_state *parent_css)
spin_lock_init(&blkcg->lock);
refcount_set(&blkcg->online_pin, 1);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT | __GFP_NOWARN);
INIT_RADIX_TREE(&blkcg->blkg_tree, GFP_NOWAIT);
INIT_HLIST_HEAD(&blkcg->blkg_list);
#ifdef CONFIG_CGROUP_WRITEBACK
INIT_LIST_HEAD(&blkcg->cgwb_list);
@ -1630,7 +1630,7 @@ int blkcg_activate_policy(struct gendisk *disk, const struct blkcg_policy *pol)
pd_prealloc = NULL;
} else {
pd = pol->pd_alloc_fn(disk, blkg->blkcg,
GFP_NOWAIT | __GFP_NOWARN);
GFP_NOWAIT);
}
if (!pd) {

View File

@ -847,7 +847,7 @@ static void blk_queue_release(struct kobject *kobj)
/* nothing to do here, all data is associated with the parent gendisk */
}
static const struct kobj_type blk_queue_ktype = {
const struct kobj_type blk_queue_ktype = {
.default_groups = blk_queue_attr_groups,
.sysfs_ops = &queue_sysfs_ops,
.release = blk_queue_release,
@ -875,15 +875,14 @@ int blk_register_queue(struct gendisk *disk)
struct request_queue *q = disk->queue;
int ret;
kobject_init(&disk->queue_kobj, &blk_queue_ktype);
ret = kobject_add(&disk->queue_kobj, &disk_to_dev(disk)->kobj, "queue");
if (ret < 0)
goto out_put_queue_kobj;
return ret;
if (queue_is_mq(q)) {
ret = blk_mq_sysfs_register(disk);
if (ret)
goto out_put_queue_kobj;
goto out_del_queue_kobj;
}
mutex_lock(&q->sysfs_lock);
@ -903,9 +902,9 @@ int blk_register_queue(struct gendisk *disk)
if (queue_is_mq(q))
elevator_set_default(q);
wbt_enable_default(disk);
blk_queue_flag_set(QUEUE_FLAG_REGISTERED, q);
wbt_enable_default(disk);
/* Now everything is ready and send out KOBJ_ADD uevent */
kobject_uevent(&disk->queue_kobj, KOBJ_ADD);
@ -934,8 +933,8 @@ int blk_register_queue(struct gendisk *disk)
mutex_unlock(&q->sysfs_lock);
if (queue_is_mq(q))
blk_mq_sysfs_unregister(disk);
out_put_queue_kobj:
kobject_put(&disk->queue_kobj);
out_del_queue_kobj:
kobject_del(&disk->queue_kobj);
return ret;
}
@ -986,5 +985,4 @@ void blk_unregister_queue(struct gendisk *disk)
elevator_set_none(q);
blk_debugfs_remove(disk);
kobject_put(&disk->queue_kobj);
}

View File

@ -85,8 +85,8 @@ struct rq_wb {
u64 sync_issue;
void *sync_cookie;
unsigned long last_issue; /* last non-throttled issue */
unsigned long last_comp; /* last non-throttled comp */
unsigned long last_issue; /* issue time of last read rq */
unsigned long last_comp; /* completion time of last read rq */
unsigned long min_lat_nsec;
struct rq_qos rqos;
struct rq_wait rq_wait[WBT_NUM_RWQ];
@ -248,13 +248,14 @@ static void wbt_done(struct rq_qos *rqos, struct request *rq)
struct rq_wb *rwb = RQWB(rqos);
if (!wbt_is_tracked(rq)) {
if (rwb->sync_cookie == rq) {
rwb->sync_issue = 0;
rwb->sync_cookie = NULL;
}
if (wbt_is_read(rq)) {
if (rwb->sync_cookie == rq) {
rwb->sync_issue = 0;
rwb->sync_cookie = NULL;
}
if (wbt_is_read(rq))
wb_timestamp(rwb, &rwb->last_comp);
}
} else {
WARN_ON_ONCE(rq == rwb->sync_cookie);
__wbt_done(rqos, wbt_flags(rq));

View File

@ -29,6 +29,7 @@ struct elevator_tags;
/* Max future timer expiry for timeouts */
#define BLK_MAX_TIMEOUT (5 * HZ)
extern const struct kobj_type blk_queue_ktype;
extern struct dentry *blk_debugfs_root;
struct blk_flush_queue {

View File

@ -1303,6 +1303,7 @@ static void disk_release(struct device *dev)
disk_free_zone_resources(disk);
xa_destroy(&disk->part_tbl);
kobject_put(&disk->queue_kobj);
disk->queue->disk = NULL;
blk_put_queue(disk->queue);
@ -1486,6 +1487,7 @@ struct gendisk *__alloc_disk_node(struct request_queue *q, int node_id,
INIT_LIST_HEAD(&disk->slave_bdevs);
#endif
mutex_init(&disk->rqos_state_mutex);
kobject_init(&disk->queue_kobj, &blk_queue_ktype);
return disk;
out_erase_part0:

View File

@ -380,6 +380,9 @@ enum {
/* this is/was a write request */
__EE_WRITE,
/* hand back using mempool_free(e, drbd_buffer_page_pool) */
__EE_RELEASE_TO_MEMPOOL,
/* this is/was a write same request */
__EE_WRITE_SAME,
@ -402,6 +405,7 @@ enum {
#define EE_IN_INTERVAL_TREE (1<<__EE_IN_INTERVAL_TREE)
#define EE_SUBMITTED (1<<__EE_SUBMITTED)
#define EE_WRITE (1<<__EE_WRITE)
#define EE_RELEASE_TO_MEMPOOL (1<<__EE_RELEASE_TO_MEMPOOL)
#define EE_WRITE_SAME (1<<__EE_WRITE_SAME)
#define EE_APPLICATION (1<<__EE_APPLICATION)
#define EE_RS_THIN_REQ (1<<__EE_RS_THIN_REQ)
@ -858,7 +862,6 @@ struct drbd_device {
struct list_head sync_ee; /* IO in progress (P_RS_DATA_REPLY gets written to disk) */
struct list_head done_ee; /* need to send P_WRITE_ACK */
struct list_head read_ee; /* [RS]P_DATA_REQUEST being read */
struct list_head net_ee; /* zero-copy network send in progress */
struct list_head resync_reads;
atomic_t pp_in_use; /* allocated from page pool */
@ -1329,24 +1332,6 @@ extern struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
extern mempool_t drbd_request_mempool;
extern mempool_t drbd_ee_mempool;
/* drbd's page pool, used to buffer data received from the peer,
* or data requested by the peer.
*
* This does not have an emergency reserve.
*
* When allocating from this pool, it first takes pages from the pool.
* Only if the pool is depleted will try to allocate from the system.
*
* The assumption is that pages taken from this pool will be processed,
* and given back, "quickly", and then can be recycled, so we can avoid
* frequent calls to alloc_page(), and still will be able to make progress even
* under memory pressure.
*/
extern struct page *drbd_pp_pool;
extern spinlock_t drbd_pp_lock;
extern int drbd_pp_vacant;
extern wait_queue_head_t drbd_pp_wait;
/* We also need a standard (emergency-reserve backed) page pool
* for meta data IO (activity log, bitmap).
* We can keep it global, as long as it is used as "N pages at a time".
@ -1354,6 +1339,7 @@ extern wait_queue_head_t drbd_pp_wait;
*/
#define DRBD_MIN_POOL_PAGES 128
extern mempool_t drbd_md_io_page_pool;
extern mempool_t drbd_buffer_page_pool;
/* We also need to make sure we get a bio
* when we need it for housekeeping purposes */
@ -1488,10 +1474,7 @@ extern struct drbd_peer_request *drbd_alloc_peer_req(struct drbd_peer_device *,
sector_t, unsigned int,
unsigned int,
gfp_t) __must_hold(local);
extern void __drbd_free_peer_req(struct drbd_device *, struct drbd_peer_request *,
int);
#define drbd_free_peer_req(m,e) __drbd_free_peer_req(m, e, 0)
#define drbd_free_net_peer_req(m,e) __drbd_free_peer_req(m, e, 1)
extern void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *req);
extern struct page *drbd_alloc_pages(struct drbd_peer_device *, unsigned int, bool);
extern void _drbd_clear_done_ee(struct drbd_device *device, struct list_head *to_be_freed);
extern int drbd_connected(struct drbd_peer_device *);
@ -1610,16 +1593,6 @@ static inline struct page *page_chain_next(struct page *page)
for (; page && ({ n = page_chain_next(page); 1; }); page = n)
static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_req)
{
struct page *page = peer_req->pages;
page_chain_for_each(page) {
if (page_count(page) > 1)
return 1;
}
return 0;
}
static inline union drbd_state drbd_read_state(struct drbd_device *device)
{
struct drbd_resource *resource = device->resource;

View File

@ -114,20 +114,10 @@ struct kmem_cache *drbd_al_ext_cache; /* activity log extents */
mempool_t drbd_request_mempool;
mempool_t drbd_ee_mempool;
mempool_t drbd_md_io_page_pool;
mempool_t drbd_buffer_page_pool;
struct bio_set drbd_md_io_bio_set;
struct bio_set drbd_io_bio_set;
/* I do not use a standard mempool, because:
1) I want to hand out the pre-allocated objects first.
2) I want to be able to interrupt sleeping allocation with a signal.
Note: This is a single linked list, the next pointer is the private
member of struct page.
*/
struct page *drbd_pp_pool;
DEFINE_SPINLOCK(drbd_pp_lock);
int drbd_pp_vacant;
wait_queue_head_t drbd_pp_wait;
DEFINE_RATELIMIT_STATE(drbd_ratelimit_state, 5 * HZ, 5);
static const struct block_device_operations drbd_ops = {
@ -1611,6 +1601,7 @@ static int _drbd_send_zc_bio(struct drbd_peer_device *peer_device, struct bio *b
static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
struct drbd_peer_request *peer_req)
{
bool use_sendpage = !(peer_req->flags & EE_RELEASE_TO_MEMPOOL);
struct page *page = peer_req->pages;
unsigned len = peer_req->i.size;
int err;
@ -1619,8 +1610,13 @@ static int _drbd_send_zc_ee(struct drbd_peer_device *peer_device,
page_chain_for_each(page) {
unsigned l = min_t(unsigned, len, PAGE_SIZE);
err = _drbd_send_page(peer_device, page, 0, l,
page_chain_next(page) ? MSG_MORE : 0);
if (likely(use_sendpage))
err = _drbd_send_page(peer_device, page, 0, l,
page_chain_next(page) ? MSG_MORE : 0);
else
err = _drbd_no_send_page(peer_device, page, 0, l,
page_chain_next(page) ? MSG_MORE : 0);
if (err)
return err;
len -= l;
@ -1962,7 +1958,6 @@ void drbd_init_set_defaults(struct drbd_device *device)
INIT_LIST_HEAD(&device->sync_ee);
INIT_LIST_HEAD(&device->done_ee);
INIT_LIST_HEAD(&device->read_ee);
INIT_LIST_HEAD(&device->net_ee);
INIT_LIST_HEAD(&device->resync_reads);
INIT_LIST_HEAD(&device->resync_work.list);
INIT_LIST_HEAD(&device->unplug_work.list);
@ -2043,7 +2038,6 @@ void drbd_device_cleanup(struct drbd_device *device)
D_ASSERT(device, list_empty(&device->sync_ee));
D_ASSERT(device, list_empty(&device->done_ee));
D_ASSERT(device, list_empty(&device->read_ee));
D_ASSERT(device, list_empty(&device->net_ee));
D_ASSERT(device, list_empty(&device->resync_reads));
D_ASSERT(device, list_empty(&first_peer_device(device)->connection->sender_work.q));
D_ASSERT(device, list_empty(&device->resync_work.list));
@ -2055,19 +2049,11 @@ void drbd_device_cleanup(struct drbd_device *device)
static void drbd_destroy_mempools(void)
{
struct page *page;
while (drbd_pp_pool) {
page = drbd_pp_pool;
drbd_pp_pool = (struct page *)page_private(page);
__free_page(page);
drbd_pp_vacant--;
}
/* D_ASSERT(device, atomic_read(&drbd_pp_vacant)==0); */
bioset_exit(&drbd_io_bio_set);
bioset_exit(&drbd_md_io_bio_set);
mempool_exit(&drbd_buffer_page_pool);
mempool_exit(&drbd_md_io_page_pool);
mempool_exit(&drbd_ee_mempool);
mempool_exit(&drbd_request_mempool);
@ -2086,9 +2072,8 @@ static void drbd_destroy_mempools(void)
static int drbd_create_mempools(void)
{
struct page *page;
const int number = (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count;
int i, ret;
int ret;
/* caches */
drbd_request_cache = kmem_cache_create(
@ -2125,6 +2110,10 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
ret = mempool_init_page_pool(&drbd_buffer_page_pool, number, 0);
if (ret)
goto Enomem;
ret = mempool_init_slab_pool(&drbd_request_mempool, number,
drbd_request_cache);
if (ret)
@ -2134,15 +2123,6 @@ static int drbd_create_mempools(void)
if (ret)
goto Enomem;
for (i = 0; i < number; i++) {
page = alloc_page(GFP_HIGHUSER);
if (!page)
goto Enomem;
set_page_private(page, (unsigned long)drbd_pp_pool);
drbd_pp_pool = page;
}
drbd_pp_vacant = number;
return 0;
Enomem:
@ -2169,10 +2149,6 @@ static void drbd_release_all_peer_reqs(struct drbd_device *device)
rr = drbd_free_peer_reqs(device, &device->done_ee);
if (rr)
drbd_err(device, "%d EEs in done list found!\n", rr);
rr = drbd_free_peer_reqs(device, &device->net_ee);
if (rr)
drbd_err(device, "%d EEs in net list found!\n", rr);
}
/* caution. no locking. */
@ -2863,11 +2839,6 @@ static int __init drbd_init(void)
return err;
}
/*
* allocate all necessary structs
*/
init_waitqueue_head(&drbd_pp_wait);
drbd_proc = NULL; /* play safe for drbd_cleanup */
idr_init(&drbd_devices);

View File

@ -33,6 +33,7 @@
#include <linux/string.h>
#include <linux/scatterlist.h>
#include <linux/part_stat.h>
#include <linux/mempool.h>
#include "drbd_int.h"
#include "drbd_protocol.h"
#include "drbd_req.h"
@ -63,182 +64,31 @@ static int e_end_block(struct drbd_work *, int);
#define GFP_TRY (__GFP_HIGHMEM | __GFP_NOWARN)
/*
* some helper functions to deal with single linked page lists,
* page->private being our "next" pointer.
*/
/* If at least n pages are linked at head, get n pages off.
* Otherwise, don't modify head, and return NULL.
* Locking is the responsibility of the caller.
*/
static struct page *page_chain_del(struct page **head, int n)
{
struct page *page;
struct page *tmp;
BUG_ON(!n);
BUG_ON(!head);
page = *head;
if (!page)
return NULL;
while (page) {
tmp = page_chain_next(page);
if (--n == 0)
break; /* found sufficient pages */
if (tmp == NULL)
/* insufficient pages, don't use any of them. */
return NULL;
page = tmp;
}
/* add end of list marker for the returned list */
set_page_private(page, 0);
/* actual return value, and adjustment of head */
page = *head;
*head = tmp;
return page;
}
/* may be used outside of locks to find the tail of a (usually short)
* "private" page chain, before adding it back to a global chain head
* with page_chain_add() under a spinlock. */
static struct page *page_chain_tail(struct page *page, int *len)
{
struct page *tmp;
int i = 1;
while ((tmp = page_chain_next(page))) {
++i;
page = tmp;
}
if (len)
*len = i;
return page;
}
static int page_chain_free(struct page *page)
{
struct page *tmp;
int i = 0;
page_chain_for_each_safe(page, tmp) {
put_page(page);
++i;
}
return i;
}
static void page_chain_add(struct page **head,
struct page *chain_first, struct page *chain_last)
{
#if 1
struct page *tmp;
tmp = page_chain_tail(chain_first, NULL);
BUG_ON(tmp != chain_last);
#endif
/* add chain to head */
set_page_private(chain_last, (unsigned long)*head);
*head = chain_first;
}
static struct page *__drbd_alloc_pages(struct drbd_device *device,
unsigned int number)
static struct page *__drbd_alloc_pages(unsigned int number)
{
struct page *page = NULL;
struct page *tmp = NULL;
unsigned int i = 0;
/* Yes, testing drbd_pp_vacant outside the lock is racy.
* So what. It saves a spin_lock. */
if (drbd_pp_vacant >= number) {
spin_lock(&drbd_pp_lock);
page = page_chain_del(&drbd_pp_pool, number);
if (page)
drbd_pp_vacant -= number;
spin_unlock(&drbd_pp_lock);
if (page)
return page;
}
/* GFP_TRY, because we must not cause arbitrary write-out: in a DRBD
* "criss-cross" setup, that might cause write-out on some other DRBD,
* which in turn might block on the other node at this very place. */
for (i = 0; i < number; i++) {
tmp = alloc_page(GFP_TRY);
tmp = mempool_alloc(&drbd_buffer_page_pool, GFP_TRY);
if (!tmp)
break;
goto fail;
set_page_private(tmp, (unsigned long)page);
page = tmp;
}
if (i == number)
return page;
/* Not enough pages immediately available this time.
* No need to jump around here, drbd_alloc_pages will retry this
* function "soon". */
if (page) {
tmp = page_chain_tail(page, NULL);
spin_lock(&drbd_pp_lock);
page_chain_add(&drbd_pp_pool, page, tmp);
drbd_pp_vacant += i;
spin_unlock(&drbd_pp_lock);
return page;
fail:
page_chain_for_each_safe(page, tmp) {
set_page_private(page, 0);
mempool_free(page, &drbd_buffer_page_pool);
}
return NULL;
}
static void reclaim_finished_net_peer_reqs(struct drbd_device *device,
struct list_head *to_be_freed)
{
struct drbd_peer_request *peer_req, *tmp;
/* The EEs are always appended to the end of the list. Since
they are sent in order over the wire, they have to finish
in order. As soon as we see the first not finished we can
stop to examine the list... */
list_for_each_entry_safe(peer_req, tmp, &device->net_ee, w.list) {
if (drbd_peer_req_has_active_page(peer_req))
break;
list_move(&peer_req->w.list, to_be_freed);
}
}
static void drbd_reclaim_net_peer_reqs(struct drbd_device *device)
{
LIST_HEAD(reclaimed);
struct drbd_peer_request *peer_req, *t;
spin_lock_irq(&device->resource->req_lock);
reclaim_finished_net_peer_reqs(device, &reclaimed);
spin_unlock_irq(&device->resource->req_lock);
list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
drbd_free_net_peer_req(device, peer_req);
}
static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection)
{
struct drbd_peer_device *peer_device;
int vnr;
rcu_read_lock();
idr_for_each_entry(&connection->peer_devices, peer_device, vnr) {
struct drbd_device *device = peer_device->device;
if (!atomic_read(&device->pp_in_use_by_net))
continue;
kref_get(&device->kref);
rcu_read_unlock();
drbd_reclaim_net_peer_reqs(device);
kref_put(&device->kref, drbd_destroy_device);
rcu_read_lock();
}
rcu_read_unlock();
}
/**
* drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled)
* @peer_device: DRBD device.
@ -263,9 +113,8 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
bool retry)
{
struct drbd_device *device = peer_device->device;
struct page *page = NULL;
struct page *page;
struct net_conf *nc;
DEFINE_WAIT(wait);
unsigned int mxb;
rcu_read_lock();
@ -273,37 +122,9 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
mxb = nc ? nc->max_buffers : 1000000;
rcu_read_unlock();
if (atomic_read(&device->pp_in_use) < mxb)
page = __drbd_alloc_pages(device, number);
/* Try to keep the fast path fast, but occasionally we need
* to reclaim the pages we lended to the network stack. */
if (page && atomic_read(&device->pp_in_use_by_net) > 512)
drbd_reclaim_net_peer_reqs(device);
while (page == NULL) {
prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE);
drbd_reclaim_net_peer_reqs(device);
if (atomic_read(&device->pp_in_use) < mxb) {
page = __drbd_alloc_pages(device, number);
if (page)
break;
}
if (!retry)
break;
if (signal_pending(current)) {
drbd_warn(device, "drbd_alloc_pages interrupted!\n");
break;
}
if (schedule_timeout(HZ/10) == 0)
mxb = UINT_MAX;
}
finish_wait(&drbd_pp_wait, &wait);
if (atomic_read(&device->pp_in_use) >= mxb)
schedule_timeout_interruptible(HZ / 10);
page = __drbd_alloc_pages(number);
if (page)
atomic_add(number, &device->pp_in_use);
@ -314,29 +135,25 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int
* Is also used from inside an other spin_lock_irq(&resource->req_lock);
* Either links the page chain back to the global pool,
* or returns all pages to the system. */
static void drbd_free_pages(struct drbd_device *device, struct page *page, int is_net)
static void drbd_free_pages(struct drbd_device *device, struct page *page)
{
atomic_t *a = is_net ? &device->pp_in_use_by_net : &device->pp_in_use;
int i;
struct page *tmp;
int i = 0;
if (page == NULL)
return;
if (drbd_pp_vacant > (DRBD_MAX_BIO_SIZE/PAGE_SIZE) * drbd_minor_count)
i = page_chain_free(page);
else {
struct page *tmp;
tmp = page_chain_tail(page, &i);
spin_lock(&drbd_pp_lock);
page_chain_add(&drbd_pp_pool, page, tmp);
drbd_pp_vacant += i;
spin_unlock(&drbd_pp_lock);
page_chain_for_each_safe(page, tmp) {
set_page_private(page, 0);
if (page_count(page) == 1)
mempool_free(page, &drbd_buffer_page_pool);
else
put_page(page);
i++;
}
i = atomic_sub_return(i, a);
i = atomic_sub_return(i, &device->pp_in_use);
if (i < 0)
drbd_warn(device, "ASSERTION FAILED: %s: %d < 0\n",
is_net ? "pp_in_use_by_net" : "pp_in_use", i);
wake_up(&drbd_pp_wait);
drbd_warn(device, "ASSERTION FAILED: pp_in_use: %d < 0\n", i);
}
/*
@ -380,6 +197,8 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
gfpflags_allow_blocking(gfp_mask));
if (!page)
goto fail;
if (!mempool_is_saturated(&drbd_buffer_page_pool))
peer_req->flags |= EE_RELEASE_TO_MEMPOOL;
}
memset(peer_req, 0, sizeof(*peer_req));
@ -403,13 +222,12 @@ drbd_alloc_peer_req(struct drbd_peer_device *peer_device, u64 id, sector_t secto
return NULL;
}
void __drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req,
int is_net)
void drbd_free_peer_req(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
might_sleep();
if (peer_req->flags & EE_HAS_DIGEST)
kfree(peer_req->digest);
drbd_free_pages(device, peer_req->pages, is_net);
drbd_free_pages(device, peer_req->pages);
D_ASSERT(device, atomic_read(&peer_req->pending_bios) == 0);
D_ASSERT(device, drbd_interval_empty(&peer_req->i));
if (!expect(device, !(peer_req->flags & EE_CALL_AL_COMPLETE_IO))) {
@ -424,14 +242,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
LIST_HEAD(work_list);
struct drbd_peer_request *peer_req, *t;
int count = 0;
int is_net = list == &device->net_ee;
spin_lock_irq(&device->resource->req_lock);
list_splice_init(list, &work_list);
spin_unlock_irq(&device->resource->req_lock);
list_for_each_entry_safe(peer_req, t, &work_list, w.list) {
__drbd_free_peer_req(device, peer_req, is_net);
drbd_free_peer_req(device, peer_req);
count++;
}
return count;
@ -443,18 +260,13 @@ int drbd_free_peer_reqs(struct drbd_device *device, struct list_head *list)
static int drbd_finish_peer_reqs(struct drbd_device *device)
{
LIST_HEAD(work_list);
LIST_HEAD(reclaimed);
struct drbd_peer_request *peer_req, *t;
int err = 0;
spin_lock_irq(&device->resource->req_lock);
reclaim_finished_net_peer_reqs(device, &reclaimed);
list_splice_init(&device->done_ee, &work_list);
spin_unlock_irq(&device->resource->req_lock);
list_for_each_entry_safe(peer_req, t, &reclaimed, w.list)
drbd_free_net_peer_req(device, peer_req);
/* possible callbacks here:
* e_end_block, and e_end_resync_block, e_send_superseded.
* all ignore the last argument.
@ -1975,7 +1787,7 @@ static int drbd_drain_block(struct drbd_peer_device *peer_device, int data_size)
data_size -= len;
}
kunmap(page);
drbd_free_pages(peer_device->device, page, 0);
drbd_free_pages(peer_device->device, page);
return err;
}
@ -5224,16 +5036,6 @@ static int drbd_disconnected(struct drbd_peer_device *peer_device)
put_ldev(device);
}
/* tcp_close and release of sendpage pages can be deferred. I don't
* want to use SO_LINGER, because apparently it can be deferred for
* more than 20 seconds (longest time I checked).
*
* Actually we don't care for exactly when the network stack does its
* put_page(), but release our reference on these pages right here.
*/
i = drbd_free_peer_reqs(device, &device->net_ee);
if (i)
drbd_info(device, "net_ee not empty, killed %u entries\n", i);
i = atomic_read(&device->pp_in_use_by_net);
if (i)
drbd_info(device, "pp_in_use_by_net = %d, expected 0\n", i);
@ -5980,8 +5782,6 @@ int drbd_ack_receiver(struct drbd_thread *thi)
while (get_t_state(thi) == RUNNING) {
drbd_thread_current_set_cpu(thi);
conn_reclaim_net_peer_reqs(connection);
if (test_and_clear_bit(SEND_PING, &connection->flags)) {
if (drbd_send_ping(connection)) {
drbd_err(connection, "drbd_send_ping has failed\n");

View File

@ -1030,22 +1030,6 @@ int drbd_resync_finished(struct drbd_peer_device *peer_device)
return 1;
}
/* helper */
static void move_to_net_ee_or_free(struct drbd_device *device, struct drbd_peer_request *peer_req)
{
if (drbd_peer_req_has_active_page(peer_req)) {
/* This might happen if sendpage() has not finished */
int i = PFN_UP(peer_req->i.size);
atomic_add(i, &device->pp_in_use_by_net);
atomic_sub(i, &device->pp_in_use);
spin_lock_irq(&device->resource->req_lock);
list_add_tail(&peer_req->w.list, &device->net_ee);
spin_unlock_irq(&device->resource->req_lock);
wake_up(&drbd_pp_wait);
} else
drbd_free_peer_req(device, peer_req);
}
/**
* w_e_end_data_req() - Worker callback, to send a P_DATA_REPLY packet in response to a P_DATA_REQUEST
* @w: work object.
@ -1059,9 +1043,8 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
drbd_free_peer_req(device, peer_req);
dec_unacked(device);
return 0;
err = 0;
goto out;
}
if (likely((peer_req->flags & EE_WAS_ERROR) == 0)) {
@ -1074,12 +1057,12 @@ int w_e_end_data_req(struct drbd_work *w, int cancel)
err = drbd_send_ack(peer_device, P_NEG_DREPLY, peer_req);
}
dec_unacked(device);
move_to_net_ee_or_free(device, peer_req);
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
out:
dec_unacked(device);
drbd_free_peer_req(device, peer_req);
return err;
}
@ -1120,9 +1103,8 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
int err;
if (unlikely(cancel)) {
drbd_free_peer_req(device, peer_req);
dec_unacked(device);
return 0;
err = 0;
goto out;
}
if (get_ldev_if_state(device, D_FAILED)) {
@ -1155,13 +1137,12 @@ int w_e_end_rsdata_req(struct drbd_work *w, int cancel)
/* update resync data with failure */
drbd_rs_failed_io(peer_device, peer_req->i.sector, peer_req->i.size);
}
dec_unacked(device);
move_to_net_ee_or_free(device, peer_req);
if (unlikely(err))
drbd_err(device, "drbd_send_block() failed\n");
out:
dec_unacked(device);
drbd_free_peer_req(device, peer_req);
return err;
}
@ -1176,9 +1157,8 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
int err, eq = 0;
if (unlikely(cancel)) {
drbd_free_peer_req(device, peer_req);
dec_unacked(device);
return 0;
err = 0;
goto out;
}
if (get_ldev(device)) {
@ -1220,12 +1200,12 @@ int w_e_end_csum_rs_req(struct drbd_work *w, int cancel)
if (drbd_ratelimit())
drbd_err(device, "Sending NegDReply. I guess it gets messy.\n");
}
dec_unacked(device);
move_to_net_ee_or_free(device, peer_req);
if (unlikely(err))
drbd_err(device, "drbd_send_block/ack() failed\n");
out:
dec_unacked(device);
drbd_free_peer_req(device, peer_req);
return err;
}

View File

@ -235,7 +235,7 @@ struct ublk_device {
struct completion completion;
unsigned int nr_queues_ready;
unsigned int nr_privileged_daemon;
bool unprivileged_daemons;
struct mutex cancel_mutex;
bool canceling;
pid_t ublksrv_tgid;
@ -1389,7 +1389,7 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
{
blk_status_t res;
if (unlikely(ubq->fail_io))
if (unlikely(READ_ONCE(ubq->fail_io)))
return BLK_STS_TARGET;
/* With recovery feature enabled, force_abort is set in
@ -1401,7 +1401,8 @@ static blk_status_t ublk_prep_req(struct ublk_queue *ubq, struct request *rq,
* Note: force_abort is guaranteed to be seen because it is set
* before request queue is unqiuesced.
*/
if (ublk_nosrv_should_queue_io(ubq) && unlikely(ubq->force_abort))
if (ublk_nosrv_should_queue_io(ubq) &&
unlikely(READ_ONCE(ubq->force_abort)))
return BLK_STS_IOERR;
if (check_cancel && unlikely(ubq->canceling))
@ -1550,7 +1551,7 @@ static void ublk_reset_ch_dev(struct ublk_device *ub)
/* set to NULL, otherwise new tasks cannot mmap io_cmd_buf */
ub->mm = NULL;
ub->nr_queues_ready = 0;
ub->nr_privileged_daemon = 0;
ub->unprivileged_daemons = false;
ub->ublksrv_tgid = -1;
}
@ -1644,7 +1645,6 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* Transition the device to the nosrv state. What exactly this
* means depends on the recovery flags
*/
blk_mq_quiesce_queue(disk->queue);
if (ublk_nosrv_should_stop_dev(ub)) {
/*
* Allow any pending/future I/O to pass through quickly
@ -1652,8 +1652,7 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
* waits for all pending I/O to complete
*/
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_get_queue(ub, i)->force_abort = true;
blk_mq_unquiesce_queue(disk->queue);
WRITE_ONCE(ublk_get_queue(ub, i)->force_abort, true);
ublk_stop_dev_unlocked(ub);
} else {
@ -1663,9 +1662,8 @@ static int ublk_ch_release(struct inode *inode, struct file *filp)
} else {
ub->dev_info.state = UBLK_S_DEV_FAIL_IO;
for (i = 0; i < ub->dev_info.nr_hw_queues; i++)
ublk_get_queue(ub, i)->fail_io = true;
WRITE_ONCE(ublk_get_queue(ub, i)->fail_io, true);
}
blk_mq_unquiesce_queue(disk->queue);
}
unlock:
mutex_unlock(&ub->mutex);
@ -1980,12 +1978,10 @@ static void ublk_mark_io_ready(struct ublk_device *ub, struct ublk_queue *ubq)
__must_hold(&ub->mutex)
{
ubq->nr_io_ready++;
if (ublk_queue_ready(ubq)) {
if (ublk_queue_ready(ubq))
ub->nr_queues_ready++;
if (capable(CAP_SYS_ADMIN))
ub->nr_privileged_daemon++;
}
if (!ub->unprivileged_daemons && !capable(CAP_SYS_ADMIN))
ub->unprivileged_daemons = true;
if (ub->nr_queues_ready == ub->dev_info.nr_hw_queues) {
/* now we are ready for handling ublk io request */
@ -2880,8 +2876,8 @@ static int ublk_ctrl_start_dev(struct ublk_device *ub,
ublk_apply_params(ub);
/* don't probe partitions if any one ubq daemon is un-trusted */
if (ub->nr_privileged_daemon != ub->nr_queues_ready)
/* don't probe partitions if any daemon task is un-trusted */
if (ub->unprivileged_daemons)
set_bit(GD_SUPPRESS_PART_SCAN, &disk->state);
ublk_get_device(ub);