DMEM cgroup pull request

This introduces a new cgroup controller to limit device memory. Notable
users would be DRM, dma-buf heaps, or v4l2. This pull request is based on
the series developed by Maarten Lankhorst, Friedrich Vock, and me:
https://lore.kernel.org/all/20241204134410.1161769-1-dev@lankhorst.se/

Merge tag 'cgroup-dmem-drm-v2' of git://git.kernel.org/pub/scm/linux/kernel/git/mripard/linux into drm-next

Signed-off-by: Dave Airlie <airlied@redhat.com>
From: Maxime Ripard <mripard@redhat.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20250110-cryptic-warm-mandrill-b71f5d@houat
commit 39388d53c5
Documentation/admin-guide/cgroup-v2.rst
@@ -64,13 +64,14 @@ v1 is available under :ref:`Documentation/admin-guide/cgroup-v1/index.rst <cgroup-v1>`.
      5-6. Device
      5-7. RDMA
        5-7-1. RDMA Interface Files
-     5-8. HugeTLB
-       5.8-1. HugeTLB Interface Files
-     5-9. Misc
-       5.9-1 Miscellaneous cgroup Interface Files
-       5.9-2 Migration and Ownership
-     5-10. Others
-       5-10-1. perf_event
+     5-8. DMEM
+     5-9. HugeTLB
+       5.9-1. HugeTLB Interface Files
+     5-10. Misc
+       5.10-1 Miscellaneous cgroup Interface Files
+       5.10-2 Migration and Ownership
+     5-11. Others
+       5-11-1. perf_event
      5-N. Non-normative information
        5-N-1. CPU controller root cgroup process behaviour
        5-N-2. IO controller root cgroup process behaviour
@@ -2626,6 +2627,49 @@ RDMA Interface Files
 	  mlx4_0 hca_handle=1 hca_object=20
 	  ocrdma1 hca_handle=1 hca_object=23
 
+DMEM
+----
+
+The "dmem" controller regulates the distribution and accounting of
+device memory regions. Because each memory region may have its own page size,
+which does not have to be equal to the system page size, the units are always bytes.
+
+DMEM Interface Files
+~~~~~~~~~~~~~~~~~~~~
+
+  dmem.max, dmem.min, dmem.low
+	A read-write nested-keyed file that exists for all the cgroups
+	except root, describing the currently configured resource limit
+	for each region.
+
+	An example for xe follows::
+
+	  drm/0000:03:00.0/vram0 1073741824
+	  drm/0000:03:00.0/stolen max
+
+	The semantics are the same as for the memory cgroup controller, and are
+	calculated in the same way.
+
+  dmem.capacity
+	A read-only file that describes maximum region capacity.
+	It only exists on the root cgroup. Not all memory can be
+	allocated by cgroups, as the kernel reserves some for
+	internal use.
+
+	An example for xe follows::
+
+	  drm/0000:03:00.0/vram0 8514437120
+	  drm/0000:03:00.0/stolen 67108864
+
+  dmem.current
+	A read-only file that describes current resource usage.
+	It exists for all cgroups except root.
+
+	An example for xe follows::
+
+	  drm/0000:03:00.0/vram0 12550144
+	  drm/0000:03:00.0/stolen 8650752
+
 HugeTLB
 -------
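As a minimal usage sketch (not from the patch itself): the nested-keyed files
above take one "<region> <value>" pair per line. The snippet below assumes
cgroup v2 mounted at /sys/fs/cgroup, an existing cgroup named "mygroup", and
the xe region key from the example; all of those are assumptions.

/* Hypothetical userspace helper: cap vram0 at 256 MiB for one cgroup. */
#include <stdio.h>

int main(void)
{
	/* Path and region key are assumptions matching the xe example above. */
	FILE *f = fopen("/sys/fs/cgroup/mygroup/dmem.max", "w");

	if (!f)
		return 1;

	fprintf(f, "drm/0000:03:00.0/vram0 268435456\n");

	return fclose(f) ? 1 : 0;
}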
Documentation/core-api/cgroup.rst (new file, 9 lines)
@@ -0,0 +1,9 @@
==================
Cgroup Kernel APIs
==================

Device Memory Cgroup API (dmemcg)
=================================
.. kernel-doc:: kernel/cgroup/dmem.c
   :export:
Documentation/core-api/index.rst
@@ -109,6 +109,7 @@ more memory-management documentation in Documentation/mm/index.rst.
    dma-isa-lpc
    swiotlb
    mm-api
+   cgroup
    genalloc
    pin_user_pages
    boot-time-mm
Documentation/gpu/drm-compute.rst (new file, 54 lines)
@@ -0,0 +1,54 @@
==================================
Long running workloads and compute
==================================

Long running workloads (compute) are workloads that will not complete in 10
seconds (roughly the time a user will wait before reaching for the power
button). This means that those workloads need to be managed with techniques
other than fences.

Some hardware may schedule compute jobs and have no way to preempt them, or
to have their memory swapped out from them. Or they simply want their
workload not to be preempted or swapped out at all.

This means that it differs from what is described in driver-api/dma-buf.rst.

As with normal compute jobs, dma-fence may not be used at all, in this case
not even to force preemption. The driver is then simply forced to unmap a BO
from the long-running compute job's address space on unbind immediately,
without even waiting for the workload to complete. Effectively this
terminates the workload when there is no hardware support to recover.

Since this is undesirable, there need to be mitigations to prevent a workload
from being terminated. There are several possible approaches, all with their
advantages and drawbacks.

The first approach you will likely try is to pin all buffers used by compute.
This guarantees that the job will run uninterrupted, but also allows a very
easy denial of service attack by pinning as much memory as possible, hogging
all GPU memory, and possibly a huge chunk of CPU memory.

A second approach that will work slightly better on its own is adding an
option not to evict when creating a new job (of any kind). If all of userspace
opts in to this flag, it would prevent cooperating userspace from
force-terminating older compute jobs to start a new one.

If job preemption and recoverable pagefaults are not available, those are the
only approaches possible. So even with those, you want a separate way of
controlling resources. The standard kernel way of doing so is cgroups.

This creates a third option: using cgroups to prevent eviction. Both GPU and
driver-allocated CPU memory would be accounted to the correct cgroup, and
eviction would be made cgroup aware. This allows the GPU to be partitioned
into cgroups, which allows jobs to run next to each other without
interference.

The interface to the cgroup would be similar to the current CPU memory
interface, with similar semantics for min/low/high/max, if eviction can
be made cgroup aware.

What should be noted is that each memory region (tiled memory for example)
should have its own accounting.

The key is set to the regionid set by the driver, for example "tile0".
For the value of $card, we use drmGetUnique().
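To make the per-region accounting above concrete, here is a minimal sketch
(not part of this pull) of how a driver could give each tile its own dmem
region with the DRM helper added below in drivers/gpu/drm/drm_drv.c; the
foo_register_tile_region() name and error-handling style are illustrative
assumptions.

#include <linux/err.h>
#include <linux/kernel.h>
#include <drm/drm_drv.h>

/* Hypothetical driver: one dmem region per tile, keyed "tile0", "tile1", ... */
static int foo_register_tile_region(struct drm_device *drm, int tile, u64 size)
{
	struct dmem_cgroup_region *region;
	char name[8];

	snprintf(name, sizeof(name), "tile%d", tile);

	/* Shows up to userspace as "drm/$card/tile<N>", $card from drmGetUnique() */
	region = drmm_cgroup_register_region(drm, name, size);
	if (IS_ERR(region))
		return PTR_ERR(region);

	return 0;
}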
drivers/gpu/drm/drm_drv.c
@@ -26,6 +26,7 @@
  * DEALINGS IN THE SOFTWARE.
  */
 
+#include <linux/cgroup_dmem.h>
 #include <linux/debugfs.h>
 #include <linux/fs.h>
 #include <linux/module.h>
@@ -820,6 +821,37 @@ void drm_dev_put(struct drm_device *dev)
 }
 EXPORT_SYMBOL(drm_dev_put);
 
+static void drmm_cg_unregister_region(struct drm_device *dev, void *arg)
+{
+	dmem_cgroup_unregister_region(arg);
+}
+
+/**
+ * drmm_cgroup_register_region - Register a region of a DRM device to cgroups
+ * @dev: device for region
+ * @region_name: Region name for registering
+ * @size: Size of region in bytes
+ *
+ * This registers the dmem region "drm/<unique>/<region_name>" for @dev and
+ * unregisters it automatically when @dev is released.
+ */
+struct dmem_cgroup_region *drmm_cgroup_register_region(struct drm_device *dev, const char *region_name, u64 size)
+{
+	struct dmem_cgroup_region *region;
+	int ret;
+
+	region = dmem_cgroup_register_region(size, "drm/%s/%s", dev->unique, region_name);
+	if (IS_ERR_OR_NULL(region))
+		return region;
+
+	ret = drmm_add_action_or_reset(dev, drmm_cg_unregister_region, region);
+	if (ret)
+		return ERR_PTR(ret);
+
+	return region;
+}
+EXPORT_SYMBOL_GPL(drmm_cgroup_register_region);
+
 static int create_compat_control_link(struct drm_device *dev)
 {
 	struct drm_minor *minor;
drivers/gpu/drm/ttm/tests/ttm_bo_test.c
@@ -258,13 +258,13 @@ static void ttm_bo_unreserve_basic(struct kunit *test)
 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL);
 	bo->priority = bo_prio;
 
-	err = ttm_resource_alloc(bo, place, &res1);
+	err = ttm_resource_alloc(bo, place, &res1, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 
 	bo->resource = res1;
 
 	/* Add a dummy resource to populate LRU */
-	ttm_resource_alloc(bo, place, &res2);
+	ttm_resource_alloc(bo, place, &res2, NULL);
 
 	dma_resv_lock(bo->base.resv, NULL);
 	ttm_bo_unreserve(bo);
@@ -300,12 +300,12 @@ static void ttm_bo_unreserve_pinned(struct kunit *test)
 	dma_resv_lock(bo->base.resv, NULL);
 	ttm_bo_pin(bo);
 
-	err = ttm_resource_alloc(bo, place, &res1);
+	err = ttm_resource_alloc(bo, place, &res1, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 	bo->resource = res1;
 
 	/* Add a dummy resource to the pinned list */
-	err = ttm_resource_alloc(bo, place, &res2);
+	err = ttm_resource_alloc(bo, place, &res2, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 	KUNIT_ASSERT_EQ(test,
 			list_is_last(&res2->lru.link, &priv->ttm_dev->unevictable), 1);
@@ -355,7 +355,7 @@ static void ttm_bo_unreserve_bulk(struct kunit *test)
 	ttm_bo_set_bulk_move(bo1, &lru_bulk_move);
 	dma_resv_unlock(bo1->base.resv);
 
-	err = ttm_resource_alloc(bo1, place, &res1);
+	err = ttm_resource_alloc(bo1, place, &res1, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 	bo1->resource = res1;
 
@@ -363,7 +363,7 @@ static void ttm_bo_unreserve_bulk(struct kunit *test)
 	ttm_bo_set_bulk_move(bo2, &lru_bulk_move);
 	dma_resv_unlock(bo2->base.resv);
 
-	err = ttm_resource_alloc(bo2, place, &res2);
+	err = ttm_resource_alloc(bo2, place, &res2, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 	bo2->resource = res2;
 
@@ -401,7 +401,7 @@ static void ttm_bo_put_basic(struct kunit *test)
 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL);
 	bo->type = ttm_bo_type_device;
 
-	err = ttm_resource_alloc(bo, place, &res);
+	err = ttm_resource_alloc(bo, place, &res, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 	bo->resource = res;
 
@@ -518,7 +518,7 @@ static void ttm_bo_pin_unpin_resource(struct kunit *test)
 
 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL);
 
-	err = ttm_resource_alloc(bo, place, &res);
+	err = ttm_resource_alloc(bo, place, &res, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 	bo->resource = res;
 
@@ -569,7 +569,7 @@ static void ttm_bo_multiple_pin_one_unpin(struct kunit *test)
 
 	bo = ttm_bo_kunit_init(test, test->priv, BO_SIZE, NULL);
 
-	err = ttm_resource_alloc(bo, place, &res);
+	err = ttm_resource_alloc(bo, place, &res, NULL);
 	KUNIT_ASSERT_EQ(test, err, 0);
 	bo->resource = res;
 
drivers/gpu/drm/ttm/tests/ttm_bo_validate_test.c
@@ -542,7 +542,7 @@ static void ttm_bo_validate_no_placement_signaled(struct kunit *test)
 		bo->ttm = old_tt;
 	}
 
-	err = ttm_resource_alloc(bo, place, &bo->resource);
+	err = ttm_resource_alloc(bo, place, &bo->resource, NULL);
 	KUNIT_EXPECT_EQ(test, err, 0);
 	KUNIT_ASSERT_EQ(test, man->usage, size);
 
@@ -603,7 +603,7 @@ static void ttm_bo_validate_no_placement_not_signaled(struct kunit *test)
 	bo = ttm_bo_kunit_init(test, test->priv, size, NULL);
 	bo->type = params->bo_type;
 
-	err = ttm_resource_alloc(bo, place, &bo->resource);
+	err = ttm_resource_alloc(bo, place, &bo->resource, NULL);
 	KUNIT_EXPECT_EQ(test, err, 0);
 
 	placement = kunit_kzalloc(test, sizeof(*placement), GFP_KERNEL);
drivers/gpu/drm/ttm/tests/ttm_resource_test.c
@@ -302,7 +302,7 @@ static void ttm_sys_man_free_basic(struct kunit *test)
 	res = kunit_kzalloc(test, sizeof(*res), GFP_KERNEL);
 	KUNIT_ASSERT_NOT_NULL(test, res);
 
-	ttm_resource_alloc(bo, place, &res);
+	ttm_resource_alloc(bo, place, &res, NULL);
 
 	man = ttm_manager_type(priv->devs->ttm_dev, mem_type);
 	man->func->free(man, res);
drivers/gpu/drm/ttm/ttm_bo.c
@@ -42,6 +42,7 @@
 #include <linux/file.h>
 #include <linux/module.h>
 #include <linux/atomic.h>
+#include <linux/cgroup_dmem.h>
 #include <linux/dma-resv.h>
 
 #include "ttm_module.h"
 
@@ -499,6 +500,13 @@ struct ttm_bo_evict_walk {
 	struct ttm_resource **res;
 	/** @evicted: Number of successful evictions. */
 	unsigned long evicted;
+
+	/** @limit_pool: Which pool limit we should test against */
+	struct dmem_cgroup_pool_state *limit_pool;
+	/** @try_low: Whether we should attempt to evict BO's with low watermark threshold */
+	bool try_low;
+	/** @hit_low: If we cannot evict a bo when @try_low is false (first pass) */
+	bool hit_low;
 };
 
 static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo)
@@ -507,6 +515,10 @@ static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *
 		container_of(walk, typeof(*evict_walk), walk);
 	s64 lret;
 
+	if (!dmem_cgroup_state_evict_valuable(evict_walk->limit_pool, bo->resource->css,
+					      evict_walk->try_low, &evict_walk->hit_low))
+		return 0;
+
 	if (bo->pin_count || !bo->bdev->funcs->eviction_valuable(bo, evict_walk->place))
 		return 0;
 
@@ -524,7 +536,7 @@ static s64 ttm_bo_evict_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *
 	evict_walk->evicted++;
 	if (evict_walk->res)
 		lret = ttm_resource_alloc(evict_walk->evictor, evict_walk->place,
-					  evict_walk->res);
+					  evict_walk->res, NULL);
 	if (lret == 0)
 		return 1;
 out:
@@ -545,7 +557,8 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev,
 			      struct ttm_buffer_object *evictor,
 			      struct ttm_operation_ctx *ctx,
 			      struct ww_acquire_ctx *ticket,
-			      struct ttm_resource **res)
+			      struct ttm_resource **res,
+			      struct dmem_cgroup_pool_state *limit_pool)
 {
 	struct ttm_bo_evict_walk evict_walk = {
 		.walk = {
@@ -556,22 +569,39 @@ static int ttm_bo_evict_alloc(struct ttm_device *bdev,
 		.place = place,
 		.evictor = evictor,
 		.res = res,
+		.limit_pool = limit_pool,
 	};
 	s64 lret;
 
 	evict_walk.walk.trylock_only = true;
 	lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1);
+
+	/* One more attempt if we hit low limit? */
+	if (!lret && evict_walk.hit_low) {
+		evict_walk.try_low = true;
+		lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1);
+	}
 	if (lret || !ticket)
 		goto out;
 
+	/* Reset low limit */
+	evict_walk.try_low = evict_walk.hit_low = false;
 	/* If ticket-locking, repeat while making progress. */
 	evict_walk.walk.trylock_only = false;
+
+retry:
 	do {
 		/* The walk may clear the evict_walk.walk.ticket field */
 		evict_walk.walk.ticket = ticket;
 		evict_walk.evicted = 0;
 		lret = ttm_lru_walk_for_evict(&evict_walk.walk, bdev, man, 1);
 	} while (!lret && evict_walk.evicted);
+
+	/* We hit the low limit? Try once more */
+	if (!lret && evict_walk.hit_low && !evict_walk.try_low) {
+		evict_walk.try_low = true;
+		goto retry;
+	}
 out:
 	if (lret < 0)
 		return lret;
 
@@ -689,6 +719,7 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo,
 
 	for (i = 0; i < placement->num_placement; ++i) {
 		const struct ttm_place *place = &placement->placement[i];
+		struct dmem_cgroup_pool_state *limit_pool = NULL;
 		struct ttm_resource_manager *man;
 		bool may_evict;
 
@@ -701,15 +732,20 @@ static int ttm_bo_alloc_resource(struct ttm_buffer_object *bo,
 			continue;
 
 		may_evict = (force_space && place->mem_type != TTM_PL_SYSTEM);
-		ret = ttm_resource_alloc(bo, place, res);
+		ret = ttm_resource_alloc(bo, place, res, force_space ? &limit_pool : NULL);
 		if (ret) {
-			if (ret != -ENOSPC)
+			if (ret != -ENOSPC && ret != -EAGAIN) {
+				dmem_cgroup_pool_state_put(limit_pool);
 				return ret;
-			if (!may_evict)
+			}
+			if (!may_evict) {
+				dmem_cgroup_pool_state_put(limit_pool);
 				continue;
+			}
 
 			ret = ttm_bo_evict_alloc(bdev, man, place, bo, ctx,
-						 ticket, res);
+						 ticket, res, limit_pool);
+			dmem_cgroup_pool_state_put(limit_pool);
 			if (ret == -EBUSY)
 				continue;
 			if (ret)
@@ -1056,6 +1092,8 @@ struct ttm_bo_swapout_walk {
 	struct ttm_lru_walk walk;
 	/** @gfp_flags: The gfp flags to use for ttm_tt_swapout() */
 	gfp_t gfp_flags;
+
+	bool hit_low, evict_low;
 };
 
 static s64
@@ -1106,7 +1144,7 @@ ttm_bo_swapout_cb(struct ttm_lru_walk *walk, struct ttm_buffer_object *bo)
 
 	memset(&hop, 0, sizeof(hop));
 	place.mem_type = TTM_PL_SYSTEM;
-	ret = ttm_resource_alloc(bo, &place, &evict_mem);
+	ret = ttm_resource_alloc(bo, &place, &evict_mem, NULL);
 	if (ret)
 		goto out;
 
drivers/gpu/drm/ttm/ttm_resource.c
@@ -26,6 +26,7 @@
 #include <linux/io-mapping.h>
 #include <linux/iosys-map.h>
 #include <linux/scatterlist.h>
+#include <linux/cgroup_dmem.h>
 
 #include <drm/ttm/ttm_bo.h>
 #include <drm/ttm/ttm_placement.h>
@@ -350,15 +351,28 @@ EXPORT_SYMBOL(ttm_resource_fini);
 
 int ttm_resource_alloc(struct ttm_buffer_object *bo,
 		       const struct ttm_place *place,
-		       struct ttm_resource **res_ptr)
+		       struct ttm_resource **res_ptr,
+		       struct dmem_cgroup_pool_state **ret_limit_pool)
 {
 	struct ttm_resource_manager *man =
 		ttm_manager_type(bo->bdev, place->mem_type);
+	struct dmem_cgroup_pool_state *pool = NULL;
 	int ret;
 
+	if (man->cg) {
+		ret = dmem_cgroup_try_charge(man->cg, bo->base.size, &pool, ret_limit_pool);
+		if (ret)
+			return ret;
+	}
+
 	ret = man->func->alloc(man, bo, place, res_ptr);
-	if (ret)
+	if (ret) {
+		if (pool)
+			dmem_cgroup_uncharge(pool, bo->base.size);
 		return ret;
+	}
+
+	(*res_ptr)->css = pool;
 
 	spin_lock(&bo->bdev->lru_lock);
 	ttm_resource_add_bulk_move(*res_ptr, bo);
@@ -370,6 +384,7 @@ EXPORT_SYMBOL_FOR_TESTS_ONLY(ttm_resource_alloc);
 void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res)
 {
 	struct ttm_resource_manager *man;
+	struct dmem_cgroup_pool_state *pool;
 
 	if (!*res)
 		return;
@@ -377,9 +392,13 @@ void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res)
 	spin_lock(&bo->bdev->lru_lock);
 	ttm_resource_del_bulk_move(*res, bo);
 	spin_unlock(&bo->bdev->lru_lock);
 
+	pool = (*res)->css;
 	man = ttm_manager_type(bo->bdev, (*res)->mem_type);
 	man->func->free(man, *res);
 	*res = NULL;
+	if (man->cg)
+		dmem_cgroup_uncharge(pool, bo->base.size);
 }
 EXPORT_SYMBOL(ttm_resource_free);
drivers/gpu/drm/xe/xe_ttm_vram_mgr.c
@@ -5,6 +5,7 @@
  */
 
 #include <drm/drm_managed.h>
+#include <drm/drm_drv.h>
 
 #include <drm/ttm/ttm_placement.h>
 #include <drm/ttm/ttm_range_manager.h>
@@ -311,6 +312,13 @@ int __xe_ttm_vram_mgr_init(struct xe_device *xe, struct xe_ttm_vram_mgr *mgr,
 	struct ttm_resource_manager *man = &mgr->manager;
 	int err;
 
+	if (mem_type != XE_PL_STOLEN) {
+		const char *name = mem_type == XE_PL_VRAM0 ? "vram0" : "vram1";
+		man->cg = drmm_cgroup_register_region(&xe->drm, name, size);
+		if (IS_ERR(man->cg))
+			return PTR_ERR(man->cg);
+	}
+
 	man->func = &xe_ttm_vram_mgr_func;
 	mgr->mem_type = mem_type;
 	mutex_init(&mgr->lock);
include/drm/drm_drv.h
@@ -34,6 +34,7 @@
 
 #include <drm/drm_device.h>
 
+struct dmem_cgroup_region;
 struct drm_fb_helper;
 struct drm_fb_helper_surface_size;
 struct drm_file;
@@ -436,6 +437,10 @@ void *__devm_drm_dev_alloc(struct device *parent,
 			   const struct drm_driver *driver,
 			   size_t size, size_t offset);
 
+struct dmem_cgroup_region *
+drmm_cgroup_register_region(struct drm_device *dev,
+			    const char *region_name, u64 size);
+
 /**
  * devm_drm_dev_alloc - Resource managed allocation of a &drm_device instance
  * @parent: Parent device object
include/drm/ttm/ttm_resource.h
@@ -38,6 +38,7 @@
 #define TTM_MAX_BO_PRIORITY	4U
 #define TTM_NUM_MEM_TYPES 8
 
+struct dmem_cgroup_device;
 struct ttm_device;
 struct ttm_resource_manager;
 struct ttm_resource;
@@ -211,6 +212,11 @@ struct ttm_resource_manager {
 	 * bdev->lru_lock.
 	 */
 	uint64_t usage;
+
+	/**
+	 * @cg: &dmem_cgroup_region used for memory accounting, if not NULL.
+	 */
+	struct dmem_cgroup_region *cg;
 };
 
 /**
@@ -239,6 +245,7 @@ struct ttm_bus_placement {
  * @placement:  Placement flags.
  * @bus: Placement on io bus accessible to the CPU
  * @bo: weak reference to the BO, protected by ttm_device::lru_lock
+ * @css: cgroup state this resource is charged to
  *
  * Structure indicating the placement and space resources used by a
  * buffer object.
@@ -251,6 +258,8 @@ struct ttm_resource {
 	struct ttm_bus_placement bus;
 	struct ttm_buffer_object *bo;
 
+	struct dmem_cgroup_pool_state *css;
+
 	/**
 	 * @lru: Least recently used list, see &ttm_resource_manager.lru
 	 */
@@ -432,7 +441,8 @@ void ttm_resource_fini(struct ttm_resource_manager *man,
 
 int ttm_resource_alloc(struct ttm_buffer_object *bo,
 		       const struct ttm_place *place,
-		       struct ttm_resource **res);
+		       struct ttm_resource **res,
+		       struct dmem_cgroup_pool_state **ret_limit_pool);
 void ttm_resource_free(struct ttm_buffer_object *bo, struct ttm_resource **res);
 bool ttm_resource_intersects(struct ttm_device *bdev,
 			     struct ttm_resource *res,
include/linux/cgroup_dmem.h (new file, 66 lines)
@@ -0,0 +1,66 @@
/* SPDX-License-Identifier: MIT */
/*
 * Copyright © 2023-2024 Intel Corporation
 */

#ifndef _CGROUP_DMEM_H
#define _CGROUP_DMEM_H

#include <linux/types.h>
#include <linux/llist.h>

struct dmem_cgroup_pool_state;

/* Opaque definition of a cgroup region, used internally */
struct dmem_cgroup_region;

#if IS_ENABLED(CONFIG_CGROUP_DMEM)
struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *name_fmt, ...) __printf(2,3);
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region);
int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
			   struct dmem_cgroup_pool_state **ret_pool,
			   struct dmem_cgroup_pool_state **ret_limit_pool);
void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size);
bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
				      struct dmem_cgroup_pool_state *test_pool,
				      bool ignore_low, bool *ret_hit_low);

void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool);
#else
static inline __printf(2,3) struct dmem_cgroup_region *
dmem_cgroup_register_region(u64 size, const char *name_fmt, ...)
{
	return NULL;
}

static inline void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{ }

static inline int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
					 struct dmem_cgroup_pool_state **ret_pool,
					 struct dmem_cgroup_pool_state **ret_limit_pool)
{
	*ret_pool = NULL;

	if (ret_limit_pool)
		*ret_limit_pool = NULL;

	return 0;
}

static inline void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{ }

static inline
bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
				      struct dmem_cgroup_pool_state *test_pool,
				      bool ignore_low, bool *ret_hit_low)
{
	return true;
}

static inline void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{ }

#endif
#endif	/* _CGROUP_DMEM_H */
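A minimal sketch (not from this series) of the charge/uncharge contract this
header exposes, for an allocator outside TTM; struct foo_device, struct
foo_allocation, foo_hw_alloc() and foo_hw_free() are hypothetical driver
internals, and only the dmem_cgroup_* calls are the API added by this pull.

#include <linux/cgroup_dmem.h>
#include <linux/err.h>

struct foo_allocation {
	struct dmem_cgroup_pool_state *pool;	/* pool charged at alloc time */
	u64 size;
};

static int foo_alloc_vram(struct foo_device *fdev, u64 size,
			  struct foo_allocation *alloc)
{
	struct dmem_cgroup_pool_state *pool;
	int ret;

	/* Charge first, so the limit is enforced before touching hardware */
	ret = dmem_cgroup_try_charge(fdev->vram_region, size, &pool, NULL);
	if (ret)
		return ret;	/* -EAGAIN means a dmem limit was hit */

	ret = foo_hw_alloc(fdev, size, alloc);
	if (ret) {
		dmem_cgroup_uncharge(pool, size);
		return ret;
	}

	/* Keep the pool so the free path uncharges the same amount */
	alloc->pool = pool;
	alloc->size = size;
	return 0;
}

static void foo_free_vram(struct foo_device *fdev, struct foo_allocation *alloc)
{
	foo_hw_free(fdev, alloc);
	dmem_cgroup_uncharge(alloc->pool, alloc->size);
}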
include/linux/cgroup_subsys.h
@@ -65,6 +65,10 @@ SUBSYS(rdma)
 SUBSYS(misc)
 #endif
 
+#if IS_ENABLED(CONFIG_CGROUP_DMEM)
+SUBSYS(dmem)
+#endif
+
 /*
  * The following subsystems are not supported on the default hierarchy.
  */
include/linux/page_counter.h
@@ -96,7 +96,7 @@ static inline void page_counter_reset_watermark(struct page_counter *counter)
 	counter->watermark = usage;
 }
 
-#ifdef CONFIG_MEMCG
+#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
 void page_counter_calculate_protection(struct page_counter *root,
 				       struct page_counter *counter,
 				       bool recursive_protection);
init/Kconfig
@@ -1128,6 +1128,7 @@ config CGROUP_PIDS
 
 config CGROUP_RDMA
 	bool "RDMA controller"
+	select PAGE_COUNTER
 	help
 	  Provides enforcement of RDMA resources defined by IB stack.
 	  It is fairly easy for consumers to exhaust RDMA resources, which
@@ -1136,6 +1137,15 @@ config CGROUP_RDMA
 	  Attaching processes with active RDMA resources to the cgroup
 	  hierarchy is allowed even if can cross the hierarchy's limit.
 
+config CGROUP_DMEM
+	bool "Device memory controller (DMEM)"
+	help
+	  The DMEM controller allows compatible devices to restrict device
+	  memory usage based on the cgroup hierarchy.
+
+	  As an example, it allows you to restrict VRAM usage for applications
+	  in the DRM subsystem.
+
 config CGROUP_FREEZER
 	bool "Freezer controller"
 	help
kernel/cgroup/Makefile
@@ -7,4 +7,5 @@ obj-$(CONFIG_CGROUP_RDMA) += rdma.o
 obj-$(CONFIG_CPUSETS) += cpuset.o
 obj-$(CONFIG_CPUSETS_V1) += cpuset-v1.o
 obj-$(CONFIG_CGROUP_MISC) += misc.o
+obj-$(CONFIG_CGROUP_DMEM) += dmem.o
 obj-$(CONFIG_CGROUP_DEBUG) += debug.o
kernel/cgroup/dmem.c (new file, 861 lines)
@@ -0,0 +1,861 @@
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright 2023-2024 Intel Corporation (Maarten Lankhorst <dev@lankhorst.se>)
 * Copyright 2024 Red Hat (Maxime Ripard <mripard@kernel.org>)
 * Partially based on the rdma and misc controllers, which bear the following copyrights:
 *
 * Copyright 2020 Google LLC
 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
 */

#include <linux/cgroup.h>
#include <linux/cgroup_dmem.h>
#include <linux/list.h>
#include <linux/mutex.h>
#include <linux/page_counter.h>
#include <linux/parser.h>
#include <linux/slab.h>

struct dmem_cgroup_region {
	/**
	 * @ref: References keeping the region alive.
	 * Keeps the region reference alive after a successful RCU lookup.
	 */
	struct kref ref;

	/** @rcu: RCU head for freeing */
	struct rcu_head rcu;

	/**
	 * @region_node: Linked into &dmem_cgroup_regions list.
	 * Protected by RCU and global spinlock.
	 */
	struct list_head region_node;

	/**
	 * @pools: List of pools linked to this region.
	 * Protected by global spinlock only
	 */
	struct list_head pools;

	/** @size: Size of region, in bytes */
	u64 size;

	/** @name: Name describing the node, set by dmem_cgroup_register_region */
	char *name;

	/**
	 * @unregistered: Whether the region is unregistered by its caller.
	 * No new pools should be added to the region afterwards.
	 */
	bool unregistered;
};

struct dmemcg_state {
	struct cgroup_subsys_state css;

	struct list_head pools;
};

struct dmem_cgroup_pool_state {
	struct dmem_cgroup_region *region;
	struct dmemcg_state *cs;

	/* css node, RCU protected against region teardown */
	struct list_head css_node;

	/* dev node, no RCU protection required */
	struct list_head region_node;

	struct rcu_head rcu;

	struct page_counter cnt;

	bool inited;
};

/*
 * 3 operations require locking protection:
 * - Registering and unregistering region to/from list, requires global lock.
 * - Adding a dmem_cgroup_pool_state to a CSS, removing when CSS is freed.
 * - Adding a dmem_cgroup_pool_state to a region list.
 *
 * Since for the most common operations RCU provides enough protection, I
 * do not think more granular locking makes sense. Most protection is offered
 * by RCU and the lockless operating page_counter.
 */
static DEFINE_SPINLOCK(dmemcg_lock);
static LIST_HEAD(dmem_cgroup_regions);

static inline struct dmemcg_state *
css_to_dmemcs(struct cgroup_subsys_state *css)
{
	return container_of(css, struct dmemcg_state, css);
}

static inline struct dmemcg_state *get_current_dmemcs(void)
{
	return css_to_dmemcs(task_get_css(current, dmem_cgrp_id));
}

static struct dmemcg_state *parent_dmemcs(struct dmemcg_state *cg)
{
	return cg->css.parent ? css_to_dmemcs(cg->css.parent) : NULL;
}

static void free_cg_pool(struct dmem_cgroup_pool_state *pool)
{
	list_del(&pool->region_node);
	kfree(pool);
}

static void
set_resource_min(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_min(&pool->cnt, val);
}

static void
set_resource_low(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_low(&pool->cnt, val);
}

static void
set_resource_max(struct dmem_cgroup_pool_state *pool, u64 val)
{
	page_counter_set_max(&pool->cnt, val);
}

static u64 get_resource_low(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.low) : 0;
}

static u64 get_resource_min(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.min) : 0;
}

static u64 get_resource_max(struct dmem_cgroup_pool_state *pool)
{
	return pool ? READ_ONCE(pool->cnt.max) : PAGE_COUNTER_MAX;
}

static u64 get_resource_current(struct dmem_cgroup_pool_state *pool)
{
	return pool ? page_counter_read(&pool->cnt) : 0;
}

static void reset_all_resource_limits(struct dmem_cgroup_pool_state *rpool)
{
	set_resource_min(rpool, 0);
	set_resource_low(rpool, 0);
	set_resource_max(rpool, PAGE_COUNTER_MAX);
}

static void dmemcs_offline(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool;

	rcu_read_lock();
	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node)
		reset_all_resource_limits(pool);
	rcu_read_unlock();
}

static void dmemcs_free(struct cgroup_subsys_state *css)
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(css);
	struct dmem_cgroup_pool_state *pool, *next;

	spin_lock(&dmemcg_lock);
	list_for_each_entry_safe(pool, next, &dmemcs->pools, css_node) {
		/*
		 * The pool is dead and all references are 0,
		 * no need for RCU protection with list_del_rcu or freeing.
		 */
		list_del(&pool->css_node);
		free_cg_pool(pool);
	}
	spin_unlock(&dmemcg_lock);

	kfree(dmemcs);
}

static struct cgroup_subsys_state *
dmemcs_alloc(struct cgroup_subsys_state *parent_css)
{
	struct dmemcg_state *dmemcs = kzalloc(sizeof(*dmemcs), GFP_KERNEL);
	if (!dmemcs)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&dmemcs->pools);
	return &dmemcs->css;
}

static struct dmem_cgroup_pool_state *
find_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool;

	list_for_each_entry_rcu(pool, &dmemcs->pools, css_node, spin_is_locked(&dmemcg_lock))
		if (pool->region == region)
			return pool;

	return NULL;
}

static struct dmem_cgroup_pool_state *pool_parent(struct dmem_cgroup_pool_state *pool)
{
	if (!pool->cnt.parent)
		return NULL;

	return container_of(pool->cnt.parent, typeof(*pool), cnt);
}

static void
dmem_cgroup_calculate_protection(struct dmem_cgroup_pool_state *limit_pool,
				 struct dmem_cgroup_pool_state *test_pool)
{
	struct page_counter *climit;
	struct cgroup_subsys_state *css, *next_css;
	struct dmemcg_state *dmemcg_iter;
	struct dmem_cgroup_pool_state *pool, *parent_pool;
	bool found_descendant;

	climit = &limit_pool->cnt;

	rcu_read_lock();
	parent_pool = pool = limit_pool;
	css = &limit_pool->cs->css;

	/*
	 * This logic is roughly equivalent to css_foreach_descendant_pre,
	 * except we also track the parent pool to find out which pool we need
	 * to calculate protection values for.
	 *
	 * We can stop the traversal once we find test_pool among the
	 * descendants since we don't really care about any others.
	 */
	while (pool != test_pool) {
		next_css = css_next_child(NULL, css);
		if (next_css) {
			parent_pool = pool;
		} else {
			while (css != &limit_pool->cs->css) {
				next_css = css_next_child(css, css->parent);
				if (next_css)
					break;
				css = css->parent;
				parent_pool = pool_parent(parent_pool);
			}
			/*
			 * We can only hit this when test_pool is not a
			 * descendant of limit_pool.
			 */
			if (WARN_ON_ONCE(css == &limit_pool->cs->css))
				break;
		}
		css = next_css;

		found_descendant = false;
		dmemcg_iter = container_of(css, struct dmemcg_state, css);

		list_for_each_entry_rcu(pool, &dmemcg_iter->pools, css_node) {
			if (pool_parent(pool) == parent_pool) {
				found_descendant = true;
				break;
			}
		}
		if (!found_descendant)
			continue;

		page_counter_calculate_protection(
			climit, &pool->cnt, true);
	}
	rcu_read_unlock();
}

/**
 * dmem_cgroup_state_evict_valuable() - Check if we should evict from test_pool
 * @limit_pool: The pool for which we hit limits
 * @test_pool: The pool for which to test
 * @ignore_low: Whether we have to respect low watermarks.
 * @ret_hit_low: Pointer to whether it makes sense to consider low watermark.
 *
 * This function returns true if we can evict from @test_pool, false if not.
 * When returning false and @ignore_low is false, @ret_hit_low may
 * be set to true to indicate this function can be retried with @ignore_low
 * set to true.
 *
 * Return: bool
 */
bool dmem_cgroup_state_evict_valuable(struct dmem_cgroup_pool_state *limit_pool,
				      struct dmem_cgroup_pool_state *test_pool,
				      bool ignore_low, bool *ret_hit_low)
{
	struct dmem_cgroup_pool_state *pool = test_pool;
	struct page_counter *climit, *ctest;
	u64 used, min, low;

	/* Can always evict from current pool, despite limits */
	if (limit_pool == test_pool)
		return true;

	if (limit_pool) {
		if (!parent_dmemcs(limit_pool->cs))
			return true;

		for (pool = test_pool; pool && limit_pool != pool; pool = pool_parent(pool))
			{}

		if (!pool)
			return false;
	} else {
		/*
		 * If there is no cgroup limiting memory usage, use the root
		 * cgroup instead for limit calculations.
		 */
		for (limit_pool = test_pool; pool_parent(limit_pool); limit_pool = pool_parent(limit_pool))
			{}
	}

	climit = &limit_pool->cnt;
	ctest = &test_pool->cnt;

	dmem_cgroup_calculate_protection(limit_pool, test_pool);

	used = page_counter_read(ctest);
	min = READ_ONCE(ctest->emin);

	if (used <= min)
		return false;

	if (!ignore_low) {
		low = READ_ONCE(ctest->elow);
		if (used > low)
			return true;

		*ret_hit_low = true;
		return false;
	}
	return true;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_state_evict_valuable);
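/*
 * A hedged caller-side sketch (not part of dmem.c): the retry contract of
 * dmem_cgroup_state_evict_valuable() above, mirroring the two-pass walk
 * that the TTM changes in this pull use. foo_lru, struct foo_bo, bo->pool
 * and foo_evict_one() are hypothetical driver pieces.
 */
static int foo_evict_until_fit(struct dmem_cgroup_pool_state *limit_pool,
			       struct list_head *foo_lru)
{
	bool hit_low = false, try_low = false;
	struct foo_bo *bo;

again:
	list_for_each_entry(bo, foo_lru, lru) {
		/* First pass respects low watermarks; hit_low records if one blocked us. */
		if (!dmem_cgroup_state_evict_valuable(limit_pool, bo->pool,
						      try_low, &hit_low))
			continue;

		if (!foo_evict_one(bo))
			return 0;	/* Freed space; the caller may retry its charge. */
	}

	/* Nothing evictable above the low watermarks: retry once past them. */
	if (hit_low && !try_low) {
		try_low = true;
		goto again;
	}

	return -ENOSPC;
}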
static struct dmem_cgroup_pool_state *
alloc_pool_single(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		  struct dmem_cgroup_pool_state **allocpool)
{
	struct dmemcg_state *parent = parent_dmemcs(dmemcs);
	struct dmem_cgroup_pool_state *pool, *ppool = NULL;

	if (!*allocpool) {
		pool = kzalloc(sizeof(*pool), GFP_NOWAIT);
		if (!pool)
			return ERR_PTR(-ENOMEM);
	} else {
		pool = *allocpool;
		*allocpool = NULL;
	}

	pool->region = region;
	pool->cs = dmemcs;

	if (parent)
		ppool = find_cg_pool_locked(parent, region);

	page_counter_init(&pool->cnt,
			  ppool ? &ppool->cnt : NULL, true);
	reset_all_resource_limits(pool);

	list_add_tail_rcu(&pool->css_node, &dmemcs->pools);
	list_add_tail(&pool->region_node, &region->pools);

	if (!parent)
		pool->inited = true;
	else
		pool->inited = ppool ? ppool->inited : false;
	return pool;
}

static struct dmem_cgroup_pool_state *
get_cg_pool_locked(struct dmemcg_state *dmemcs, struct dmem_cgroup_region *region,
		   struct dmem_cgroup_pool_state **allocpool)
{
	struct dmem_cgroup_pool_state *pool, *ppool, *retpool;
	struct dmemcg_state *p, *pp;

	/*
	 * Recursively create pool, we may not initialize yet on
	 * recursion, this is done as a separate step.
	 */
	for (p = dmemcs; p; p = parent_dmemcs(p)) {
		pool = find_cg_pool_locked(p, region);
		if (!pool)
			pool = alloc_pool_single(p, region, allocpool);

		if (IS_ERR(pool))
			return pool;

		if (p == dmemcs && pool->inited)
			return pool;

		if (pool->inited)
			break;
	}

	retpool = pool = find_cg_pool_locked(dmemcs, region);
	for (p = dmemcs, pp = parent_dmemcs(dmemcs); pp; p = pp, pp = parent_dmemcs(p)) {
		if (pool->inited)
			break;

		/* ppool was created if it didn't exist by above loop. */
		ppool = find_cg_pool_locked(pp, region);

		/* Fix up parent links, mark as inited. */
		pool->cnt.parent = &ppool->cnt;
		pool->inited = true;

		pool = ppool;
	}

	return retpool;
}

static void dmemcg_free_rcu(struct rcu_head *rcu)
{
	struct dmem_cgroup_region *region = container_of(rcu, typeof(*region), rcu);
	struct dmem_cgroup_pool_state *pool, *next;

	list_for_each_entry_safe(pool, next, &region->pools, region_node)
		free_cg_pool(pool);
	kfree(region->name);
	kfree(region);
}

static void dmemcg_free_region(struct kref *ref)
{
	struct dmem_cgroup_region *cgregion = container_of(ref, typeof(*cgregion), ref);

	call_rcu(&cgregion->rcu, dmemcg_free_rcu);
}

/**
 * dmem_cgroup_unregister_region() - Unregister a previously registered region.
 * @region: The region to unregister.
 *
 * This function undoes dmem_cgroup_register_region.
 */
void dmem_cgroup_unregister_region(struct dmem_cgroup_region *region)
{
	struct list_head *entry;

	if (!region)
		return;

	spin_lock(&dmemcg_lock);

	/* Remove from global region list */
	list_del_rcu(&region->region_node);

	list_for_each_rcu(entry, &region->pools) {
		struct dmem_cgroup_pool_state *pool =
			container_of(entry, typeof(*pool), region_node);

		list_del_rcu(&pool->css_node);
	}

	/*
	 * Ensure any RCU based lookups fail. Additionally,
	 * no new pools should be added to the dead region
	 * by get_cg_pool_unlocked.
	 */
	region->unregistered = true;
	spin_unlock(&dmemcg_lock);

	kref_put(&region->ref, dmemcg_free_region);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_unregister_region);

/**
 * dmem_cgroup_register_region() - Register a region for dev cgroup.
 * @size: Size of region to register, in bytes.
 * @fmt: Region name format, printf style.
 *
 * This function registers a node in the dmem cgroup with the
 * name given. After calling this function, the region can be
 * used for allocations.
 *
 * Return: NULL or a struct on success, PTR_ERR on failure.
 */
struct dmem_cgroup_region *dmem_cgroup_register_region(u64 size, const char *fmt, ...)
{
	struct dmem_cgroup_region *ret;
	char *region_name;
	va_list ap;

	if (!size)
		return NULL;

	va_start(ap, fmt);
	region_name = kvasprintf(GFP_KERNEL, fmt, ap);
	va_end(ap);
	if (!region_name)
		return ERR_PTR(-ENOMEM);

	ret = kzalloc(sizeof(*ret), GFP_KERNEL);
	if (!ret) {
		kfree(region_name);
		return ERR_PTR(-ENOMEM);
	}

	INIT_LIST_HEAD(&ret->pools);
	ret->name = region_name;
	ret->size = size;
	kref_init(&ret->ref);

	spin_lock(&dmemcg_lock);
	list_add_tail_rcu(&ret->region_node, &dmem_cgroup_regions);
	spin_unlock(&dmemcg_lock);

	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_register_region);

static struct dmem_cgroup_region *dmemcg_get_region_by_name(const char *name)
{
	struct dmem_cgroup_region *region;

	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node, spin_is_locked(&dmemcg_lock))
		if (!strcmp(name, region->name) &&
		    kref_get_unless_zero(&region->ref))
			return region;

	return NULL;
}

/**
 * dmem_cgroup_pool_state_put() - Drop a reference to a dmem_cgroup_pool_state
 * @pool: &dmem_cgroup_pool_state
 *
 * Called to drop a reference to the limiting pool returned by
 * dmem_cgroup_try_charge().
 */
void dmem_cgroup_pool_state_put(struct dmem_cgroup_pool_state *pool)
{
	if (pool)
		css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_pool_state_put);

static struct dmem_cgroup_pool_state *
get_cg_pool_unlocked(struct dmemcg_state *cg, struct dmem_cgroup_region *region)
{
	struct dmem_cgroup_pool_state *pool, *allocpool = NULL;

	/* fastpath lookup? */
	rcu_read_lock();
	pool = find_cg_pool_locked(cg, region);
	if (pool && !READ_ONCE(pool->inited))
		pool = NULL;
	rcu_read_unlock();

	while (!pool) {
		spin_lock(&dmemcg_lock);
		if (!region->unregistered)
			pool = get_cg_pool_locked(cg, region, &allocpool);
		else
			pool = ERR_PTR(-ENODEV);
		spin_unlock(&dmemcg_lock);

		if (pool == ERR_PTR(-ENOMEM)) {
			pool = NULL;
			if (WARN_ON(allocpool))
				continue;

			allocpool = kzalloc(sizeof(*allocpool), GFP_KERNEL);
			if (allocpool) {
				pool = NULL;
				continue;
			}
		}
	}

	kfree(allocpool);
	return pool;
}
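/*
 * A generic sketch (not part of dmem.c) of the allocation pattern used by
 * get_cg_pool_unlocked() above: try GFP_NOWAIT under the spinlock, and on
 * -ENOMEM retry after a GFP_KERNEL preallocation outside it. All foo_*
 * names are hypothetical.
 */
static struct foo_item *foo_get_or_create(struct foo_table *t, int key)
{
	struct foo_item *item = NULL, *prealloc = NULL;

	while (!item) {
		spin_lock(&t->lock);
		/* May consume prealloc; returns ERR_PTR(-ENOMEM) if it had none. */
		item = foo_lookup_or_install(t, key, &prealloc);
		spin_unlock(&t->lock);

		if (item == ERR_PTR(-ENOMEM)) {
			item = NULL;
			prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
			if (!prealloc)
				return ERR_PTR(-ENOMEM);
		}
	}

	kfree(prealloc);	/* Left over if a concurrent creator won the race. */
	return item;
}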
/**
 * dmem_cgroup_uncharge() - Uncharge a pool.
 * @pool: Pool to uncharge.
 * @size: Size to uncharge.
 *
 * Undoes the effects of dmem_cgroup_try_charge.
 * Must be called with the returned pool as argument,
 * and the same @size.
 */
void dmem_cgroup_uncharge(struct dmem_cgroup_pool_state *pool, u64 size)
{
	if (!pool)
		return;

	page_counter_uncharge(&pool->cnt, size);
	css_put(&pool->cs->css);
}
EXPORT_SYMBOL_GPL(dmem_cgroup_uncharge);

/**
 * dmem_cgroup_try_charge() - Try charging a new allocation to a region.
 * @region: Region to charge
 * @size: Size (in bytes) to charge.
 * @ret_pool: On successful allocation, the pool that is charged.
 * @ret_limit_pool: On a failed allocation, the limiting pool.
 *
 * This function charges the current pool for @region for a
 * size of @size bytes.
 *
 * If the function succeeds, @ret_pool is set, which must be passed to
 * dmem_cgroup_uncharge() when undoing the allocation.
 *
 * When this function fails with -EAGAIN and @ret_limit_pool is non-null, it
 * will be set to the pool for which the limit is hit. This can be used for
 * eviction as argument to dmem_cgroup_state_evict_valuable(). This reference
 * must be freed with dmem_cgroup_pool_state_put().
 *
 * Return: 0 on success, -EAGAIN on hitting a limit, or a negative errno on failure.
 */
int dmem_cgroup_try_charge(struct dmem_cgroup_region *region, u64 size,
			   struct dmem_cgroup_pool_state **ret_pool,
			   struct dmem_cgroup_pool_state **ret_limit_pool)
{
	struct dmemcg_state *cg;
	struct dmem_cgroup_pool_state *pool;
	struct page_counter *fail;
	int ret;

	*ret_pool = NULL;
	if (ret_limit_pool)
		*ret_limit_pool = NULL;

	/*
	 * hold on to css, as cgroup can be removed but resource
	 * accounting happens on css.
	 */
	cg = get_current_dmemcs();

	pool = get_cg_pool_unlocked(cg, region);
	if (IS_ERR(pool)) {
		ret = PTR_ERR(pool);
		goto err;
	}

	if (!page_counter_try_charge(&pool->cnt, size, &fail)) {
		if (ret_limit_pool) {
			*ret_limit_pool = container_of(fail, struct dmem_cgroup_pool_state, cnt);
			css_get(&(*ret_limit_pool)->cs->css);
		}
		ret = -EAGAIN;
		goto err;
	}

	/* On success, reference from get_current_dmemcs is transferred to *ret_pool */
	*ret_pool = pool;
	return 0;

err:
	css_put(&cg->css);
	return ret;
}
EXPORT_SYMBOL_GPL(dmem_cgroup_try_charge);
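/*
 * A hedged sketch (not part of dmem.c) of a driver-side charge path that
 * consumes @ret_limit_pool on -EAGAIN, in the spirit of the TTM changes in
 * this pull. foo_* names, including foo_evict_until_fit() from the earlier
 * sketch, are hypothetical.
 */
static int foo_alloc_charged(struct foo_device *fdev, u64 size,
			     struct dmem_cgroup_pool_state **ret_pool)
{
	struct dmem_cgroup_pool_state *limit_pool = NULL;
	int ret;

	ret = dmem_cgroup_try_charge(fdev->vram_region, size,
				     ret_pool, &limit_pool);
	if (ret != -EAGAIN)
		return ret;

	/* A limit was hit: evict from cgroups under limit_pool, then retry once. */
	ret = foo_evict_until_fit(limit_pool, &fdev->lru);
	dmem_cgroup_pool_state_put(limit_pool);
	if (ret)
		return ret;

	return dmem_cgroup_try_charge(fdev->vram_region, size, ret_pool, NULL);
}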
static int dmem_cgroup_region_capacity_show(struct seq_file *sf, void *v)
{
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		seq_puts(sf, region->name);
		seq_printf(sf, " %llu\n", region->size);
	}
	rcu_read_unlock();
	return 0;
}

static int dmemcg_parse_limit(char *options, struct dmem_cgroup_region *region,
			      u64 *new_limit)
{
	char *end;

	if (!strcmp(options, "max")) {
		*new_limit = PAGE_COUNTER_MAX;
		return 0;
	}

	*new_limit = memparse(options, &end);
	if (*end != '\0')
		return -EINVAL;

	return 0;
}

static ssize_t dmemcg_limit_write(struct kernfs_open_file *of,
				  char *buf, size_t nbytes, loff_t off,
				  void (*apply)(struct dmem_cgroup_pool_state *, u64))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(of_css(of));
	int err = 0;

	while (buf && !err) {
		struct dmem_cgroup_pool_state *pool = NULL;
		char *options, *region_name;
		struct dmem_cgroup_region *region;
		u64 new_limit;

		options = buf;
		buf = strchr(buf, '\n');
		if (buf)
			*buf++ = '\0';

		options = strstrip(options);

		/* eat empty lines */
		if (!options[0])
			continue;

		region_name = strsep(&options, " \t");
		if (!region_name[0])
			continue;

		rcu_read_lock();
		region = dmemcg_get_region_by_name(region_name);
		rcu_read_unlock();

		if (!region)
			return -EINVAL;

		err = dmemcg_parse_limit(options, region, &new_limit);
		if (err < 0)
			goto out_put;

		pool = get_cg_pool_unlocked(dmemcs, region);
		if (IS_ERR(pool)) {
			err = PTR_ERR(pool);
			goto out_put;
		}

		/* And commit */
		apply(pool, new_limit);

out_put:
		kref_put(&region->ref, dmemcg_free_region);
	}

	return err ?: nbytes;
}

static int dmemcg_limit_show(struct seq_file *sf, void *v,
			     u64 (*fn)(struct dmem_cgroup_pool_state *))
{
	struct dmemcg_state *dmemcs = css_to_dmemcs(seq_css(sf));
	struct dmem_cgroup_region *region;

	rcu_read_lock();
	list_for_each_entry_rcu(region, &dmem_cgroup_regions, region_node) {
		struct dmem_cgroup_pool_state *pool = find_cg_pool_locked(dmemcs, region);
		u64 val;

		seq_puts(sf, region->name);

		val = fn(pool);
		if (val < PAGE_COUNTER_MAX)
			seq_printf(sf, " %lld\n", val);
		else
			seq_puts(sf, " max\n");
	}
	rcu_read_unlock();

	return 0;
}

static int dmem_cgroup_region_current_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_current);
}

static int dmem_cgroup_region_min_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_min);
}

static ssize_t dmem_cgroup_region_min_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_min);
}

static int dmem_cgroup_region_low_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_low);
}

static ssize_t dmem_cgroup_region_low_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_low);
}

static int dmem_cgroup_region_max_show(struct seq_file *sf, void *v)
{
	return dmemcg_limit_show(sf, v, get_resource_max);
}

static ssize_t dmem_cgroup_region_max_write(struct kernfs_open_file *of,
					    char *buf, size_t nbytes, loff_t off)
{
	return dmemcg_limit_write(of, buf, nbytes, off, set_resource_max);
}

static struct cftype files[] = {
	{
		.name = "capacity",
		.seq_show = dmem_cgroup_region_capacity_show,
		.flags = CFTYPE_ONLY_ON_ROOT,
	},
	{
		.name = "current",
		.seq_show = dmem_cgroup_region_current_show,
	},
	{
		.name = "min",
		.write = dmem_cgroup_region_min_write,
		.seq_show = dmem_cgroup_region_min_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "low",
		.write = dmem_cgroup_region_low_write,
		.seq_show = dmem_cgroup_region_low_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{
		.name = "max",
		.write = dmem_cgroup_region_max_write,
		.seq_show = dmem_cgroup_region_max_show,
		.flags = CFTYPE_NOT_ON_ROOT,
	},
	{ } /* Zero entry terminates. */
};

struct cgroup_subsys dmem_cgrp_subsys = {
	.css_alloc	= dmemcs_alloc,
	.css_free	= dmemcs_free,
	.css_offline	= dmemcs_offline,
	.legacy_cftypes	= files,
	.dfl_cftypes	= files,
};
mm/page_counter.c
@@ -288,7 +288,7 @@ int page_counter_memparse(const char *buf, const char *max,
 }
 
 
-#ifdef CONFIG_MEMCG
+#if IS_ENABLED(CONFIG_MEMCG) || IS_ENABLED(CONFIG_CGROUP_DMEM)
 /*
  * This function calculates an individual page counter's effective
  * protection which is derived from its own memory.min/low, its
@@ -460,4 +460,4 @@ void page_counter_calculate_protection(struct page_counter *root,
 				atomic_long_read(&parent->children_low_usage),
 				recursive_protection));
 }
-#endif /* CONFIG_MEMCG */
+#endif /* CONFIG_MEMCG || CONFIG_CGROUP_DMEM */