1 // SPDX-License-Identifier: GPL-2.0-only
3 * RDMA resource limiting controller for cgroups.
5 * Used to allow a cgroup hierarchy to stop processes from consuming
6 * additional RDMA resources after a certain limit is reached.
8 * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
11 #include <linux/bitops.h>
12 #include <linux/slab.h>
13 #include <linux/seq_file.h>
14 #include <linux/cgroup.h>
15 #include <linux/parser.h>
16 #include <linux/cgroup_rdma.h>
/*
 * Keyword that stands in for a number when a resource is unlimited: it is
 * compared against the user-supplied value while parsing, and it is printed
 * whenever a reported value equals S32_MAX.
 */
18 #define RDMACG_MAX_STR "max"
21 * Protects list of resource pools maintained on per cgroup basis
22 * and rdma device list.
/* Global lock; the *_locked helpers in this file assert that it is held. */
24 static DEFINE_MUTEX(rdmacg_mutex);
/* Registered rdmacg devices, linked through rdmacg_device.dev_node. */
25 static LIST_HEAD(rdmacg_devices);
/*
 * Selects which per-resource value a cgroup file reports. The value is stored
 * in the cftype .private field and read back in print_rpool_values().
 */
27 enum rdmacg_file_type {
/* report the configured limit (resources[i].max) */
28 RDMACG_RESOURCE_TYPE_MAX,
/* presumably reports the current usage (resources[i].usage) -- confirm */
29 RDMACG_RESOURCE_TYPE_STAT,
33 * Resource table definition as seen by the user.
34 * Entries need to be added to it when more resources are
35 * added/defined at the IB verb/core layer.
/*
 * User-visible resource names, indexed by resource type. The table is used to
 * match the resource name in tokens written by the user and to label the
 * values printed on read.
 */
37 static char const *rdmacg_resource_names[] = {
38 [RDMACG_RESOURCE_HCA_HANDLE] = "hca_handle",
39 [RDMACG_RESOURCE_HCA_OBJECT] = "hca_object",
42 /* resource tracker for each resource of rdma cgroup */
/*
 * NOTE(review): the member declarations are missing from this extract. Other
 * code in this file accesses the members .max and .usage of this struct.
 */
43 struct rdmacg_resource {
49 * resource pool object which represents per cgroup, per device
50 * resources. There are multiple instances of this object per cgroup,
51 * therefore it cannot be embedded within rdma_cgroup structure. It
52 * is maintained as list.
/*
 * One pool per (cgroup, device) pair; it is linked into both the owning
 * cgroup's pool list and the device's pool list.
 */
54 struct rdmacg_resource_pool {
/* device whose resources this pool accounts for */
55 struct rdmacg_device *device;
/* per-resource tracker, indexed by resource type */
56 struct rdmacg_resource resources[RDMACG_RESOURCE_MAX];
/* link in rdma_cgroup.rpools */
58 struct list_head cg_node;
/* link in rdmacg_device.rpools */
59 struct list_head dev_node;
61 /* count active user tasks of this pool */
/*
 * NOTE(review): the field declarations that the comments above and below
 * describe are missing from this extract; the code in this file references
 * rpool->usage_sum and rpool->num_max_cnt.
 */
63 /* total number counts which are set to max */
67 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
69 return container_of(css, struct rdma_cgroup, css);
72 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
74 return css_rdmacg(cg->css.parent);
77 static inline struct rdma_cgroup *get_current_rdmacg(void)
79 return css_rdmacg(task_get_css(current, rdma_cgrp_id));
/*
 * Record @new_max as the limit of resource @index in @rpool.
 *
 * NOTE(review): the statements guarded by the two inner S32_MAX comparisons
 * are missing from this extract. Presumably they adjust rpool->num_max_cnt
 * when a limit changes to or from S32_MAX -- confirm against the full source.
 */
82 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
83 int index, int new_max)
85 if (new_max == S32_MAX) {
86 if (rpool->resources[index].max != S32_MAX)
89 if (rpool->resources[index].max == S32_MAX)
92 rpool->resources[index].max = new_max;
95 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
99 for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
100 set_resource_limit(rpool, i, S32_MAX);
/*
 * Unlink @rpool from both its cgroup's pool list and its device's pool list.
 * The caller must hold rdmacg_mutex.
 *
 * NOTE(review): callers treat this as deleting the pool, but no release of
 * the pool's memory is visible in this extract -- confirm against the full
 * source.
 */
103 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
105 lockdep_assert_held(&rdmacg_mutex);
107 list_del(&rpool->cg_node);
108 list_del(&rpool->dev_node);
/*
 * Search the resource pools of @cg for the one bound to @device.
 * The caller must hold rdmacg_mutex.
 *
 * NOTE(review): the return statements are missing from this extract. Callers
 * test the result against NULL, so presumably the matching pool is returned
 * and NULL is returned when there is none -- confirm.
 */
112 static struct rdmacg_resource_pool *
113 find_cg_rpool_locked(struct rdma_cgroup *cg,
114 struct rdmacg_device *device)
117 struct rdmacg_resource_pool *pool;
119 lockdep_assert_held(&rdmacg_mutex);
121 list_for_each_entry(pool, &cg->rpools, cg_node)
122 if (pool->device == device)
/*
 * Look up the resource pool of (@cg, @device); the visible code also
 * allocates a zeroed pool, sets every limit in it to S32_MAX and links it
 * into both the cgroup's and the device's pool lists.
 * ERR_PTR(-ENOMEM) is returned on the allocation-failure path.
 *
 * NOTE(review): the conditions around the lookup and the allocation, and the
 * final return, are missing from this extract. Presumably an existing pool is
 * returned as-is and a new one is only allocated when none exists -- confirm.
 */
128 static struct rdmacg_resource_pool *
129 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
131 struct rdmacg_resource_pool *rpool;
133 rpool = find_cg_rpool_locked(cg, device);
137 rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
139 return ERR_PTR(-ENOMEM);
141 rpool->device = device;
/* a fresh pool starts with no limit on any resource */
142 set_all_resource_max_limit(rpool);
144 INIT_LIST_HEAD(&rpool->cg_node);
145 INIT_LIST_HEAD(&rpool->dev_node);
146 list_add_tail(&rpool->cg_node, &cg->rpools);
147 list_add_tail(&rpool->dev_node, &device->rpools);
152 * uncharge_cg_locked - uncharge resource for rdma cgroup
153 * @cg: pointer to cg to uncharge and all parents in hierarchy
154 * @device: pointer to rdmacg device
155 * @index: index of the resource to uncharge in cg (resource pool)
157 * It also frees the resource pool which was created as part of
158 * charging operation when there are no resources attached to
/*
 * Decrement the usage of resource @index in the (@cg, @device) pool and
 * delete the pool once it has no users and all its limits are at max.
 */
162 uncharge_cg_locked(struct rdma_cgroup *cg,
163 struct rdmacg_device *device,
164 enum rdmacg_resource_type index)
166 struct rdmacg_resource_pool *rpool;
168 rpool = find_cg_rpool_locked(cg, device);
171 * rpool cannot be null at this stage. Let kernel operate in case
172 * if there a bug in IB stack or rdma controller, instead of crashing
175 if (unlikely(!rpool)) {
/*
 * NOTE(review): the format string names the device first and the cgroup
 * second, but the arguments are passed as (cg, device) -- the two arguments
 * appear to be swapped.
 */
176 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
180 rpool->resources[index].usage--;
183 * A negative count (or overflow) is invalid,
184 * it indicates a bug in the rdma controller.
186 WARN_ON_ONCE(rpool->resources[index].usage < 0);
/*
 * NOTE(review): no update of rpool->usage_sum is visible in this extract
 * before it is tested below -- confirm against the full source.
 */
188 if (rpool->usage_sum == 0 &&
189 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
191 * No user of the rpool and all entries are set to max, so
192 * safe to delete this rpool.
194 free_cg_rpool_locked(rpool);
199 * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
200 * @device: pointer to rdmacg device
201 * @stop_cg: while traversing the hierarchy, upon reaching the stop_cg cgroup
203 * @index: index of the resource to uncharge in cg in given resource pool
/*
 * Uncharge resource @index of @device from @cg and from each of its
 * ancestors, stopping before @stop_cg; the walk runs under rdmacg_mutex.
 *
 * NOTE(review): statements after the unlock may be missing from this
 * extract -- confirm against the full source.
 */
205 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
206 struct rdmacg_device *device,
207 struct rdma_cgroup *stop_cg,
208 enum rdmacg_resource_type index)
210 struct rdma_cgroup *p;
212 mutex_lock(&rdmacg_mutex);
/* @stop_cg itself is not uncharged: the loop ends as soon as p equals it */
214 for (p = cg; p != stop_cg; p = parent_rdmacg(p))
215 uncharge_cg_locked(p, device, index);
217 mutex_unlock(&rdmacg_mutex);
223 * rdmacg_uncharge - hierarchically uncharge rdma resource count
224 * @device: pointer to rdmacg device
225 * @index: index of the resource to uncharge in cgroup in given resource pool
/*
 * Exported entry point: uncharge resource @index of @device from @cg and all
 * of its ancestors (a NULL stop cgroup is passed to the hierarchy walk).
 *
 * NOTE(review): the statement guarded by the range check on @index is
 * missing from this extract; presumably it returns without uncharging --
 * confirm.
 */
227 void rdmacg_uncharge(struct rdma_cgroup *cg,
228 struct rdmacg_device *device,
229 enum rdmacg_resource_type index)
231 if (index >= RDMACG_RESOURCE_MAX)
234 rdmacg_uncharge_hierarchy(cg, device, NULL, index);
236 EXPORT_SYMBOL(rdmacg_uncharge);
239 * rdmacg_try_charge - hierarchically try to charge the rdma resource
240 * @rdmacg: pointer to rdma cgroup which will own this resource
241 * @device: pointer to rdmacg device
242 * @index: index of the resource to charge in cgroup (resource pool)
244 * This function follows charging resource in hierarchical way.
245 * It will fail if the charge would cause the new value to exceed the
246 * hierarchical limit.
247 * Returns 0 if the charge succeeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
248 * Returns pointer to rdmacg for this resource when charging is successful.
250 * Charger needs to account resources on two criteria.
251 * (a) per cgroup & (b) per device resource usage.
252 * Per cgroup resource usage ensures that tasks of a cgroup don't cross
253 * the configured limits. Per device accounting provides granular configuration
254 * in multi device usage. It allocates a resource pool in the hierarchy
255 * for each parent it comes across for the first resource. Later on the resource
256 * pool will be available. Therefore it will be much faster thereon
257 * to charge/uncharge.
/*
 * Charge one unit of resource @index on @device to the current task's cgroup
 * and to every ancestor, creating resource pools on demand.
 *
 * NOTE(review): several lines of this function (local declarations, the
 * statements guarded by the visible conditions, the success-path return and
 * the error labels) are missing from this extract; the comments below only
 * describe what is visible.
 */
259 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
260 struct rdmacg_device *device,
261 enum rdmacg_resource_type index)
263 struct rdma_cgroup *cg, *p;
264 struct rdmacg_resource_pool *rpool;
268 if (index >= RDMACG_RESOURCE_MAX)
272 * hold on to css, as cgroup can be removed but resource
273 * accounting happens on css.
275 cg = get_current_rdmacg();
277 mutex_lock(&rdmacg_mutex);
/* walk from the current task's cgroup up through its ancestors */
278 for (p = cg; p; p = parent_rdmacg(p)) {
279 rpool = get_cg_rpool_locked(p, device);
281 ret = PTR_ERR(rpool);
/* the charge is refused when one more unit would exceed this level's limit */
284 new = rpool->resources[index].usage + 1;
285 if (new > rpool->resources[index].max) {
289 rpool->resources[index].usage = new;
294 mutex_unlock(&rdmacg_mutex);
/*
 * Second unlock + uncharge: the levels from cg up to, but not including, p
 * are uncharged again. Presumably this is the error path reached when the
 * walk stopped at p -- the label is not visible in this extract.
 */
300 mutex_unlock(&rdmacg_mutex);
301 rdmacg_uncharge_hierarchy(cg, device, p, index);
304 EXPORT_SYMBOL(rdmacg_try_charge);
307 * rdmacg_register_device - register rdmacg device to rdma controller.
308 * @device: pointer to rdmacg device whose resources need to be accounted.
310 * If the IB stack wishes a device to participate in rdma cgroup resource
311 * tracking, it must invoke this API to register with rdma cgroup before
312 * any user space application can start using the RDMA resources.
314 void rdmacg_register_device(struct rdmacg_device *device)
316 INIT_LIST_HEAD(&device->dev_node);
317 INIT_LIST_HEAD(&device->rpools);
319 mutex_lock(&rdmacg_mutex);
320 list_add_tail(&device->dev_node, &rdmacg_devices);
321 mutex_unlock(&rdmacg_mutex);
323 EXPORT_SYMBOL(rdmacg_register_device);
326 * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
327 * @device: pointer to rdmacg device which was previously registered with rdma
328 * controller using rdmacg_register_device().
330 * IB stack must invoke this after all the resources of the IB device
331 * are destroyed and after ensuring that no more resources will be created
332 * when this API is invoked.
334 void rdmacg_unregister_device(struct rdmacg_device *device)
336 struct rdmacg_resource_pool *rpool, *tmp;
339 * Synchronize with any active resource settings,
340 * usage query happening via configfs.
342 mutex_lock(&rdmacg_mutex);
343 list_del_init(&device->dev_node);
346 * Now that this device is off the cgroup list, its safe to free
347 * all the rpool resources.
349 list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
350 free_cg_rpool_locked(rpool);
352 mutex_unlock(&rdmacg_mutex);
354 EXPORT_SYMBOL(rdmacg_unregister_device);
/*
 * Parse one "name=value" token: the name is looked up in
 * rdmacg_resource_names and the value is parsed as an integer or compared
 * against RDMACG_MAX_STR. @c is modified in place by strsep().
 *
 * NOTE(review): local declarations, error checks and the return statements
 * are missing from this extract. Presumably the resource index is returned on
 * success (the caller uses the result as an index) -- confirm.
 */
356 static int parse_resource(char *c, int *intval)
359 char *name, *value = c;
/* split at the first '='; afterwards value points past it */
363 name = strsep(&value, "=");
367 i = match_string(rdmacg_resource_names, RDMACG_RESOURCE_MAX, name);
374 argstr.to = value + len;
376 ret = match_int(&argstr, intval);
/*
 * NOTE(review): if len is the length of value, strncmp() bounded by len also
 * compares equal for proper prefixes of "max" (and for an empty value) --
 * confirm whether that is intended.
 */
382 if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
/*
 * Parse the space-separated resource options in @options. For every parsed
 * resource the value is stored in @new_limits at the resource's index and the
 * corresponding bit is set in @enables.
 *
 * NOTE(review): local declarations, the error check on the parse result and
 * the return statements are missing from this extract.
 */
389 static int rdmacg_parse_limits(char *options,
390 int *new_limits, unsigned long *enables)
395 /* parse resource options */
396 while ((c = strsep(&options, " ")) != NULL) {
399 index = parse_resource(c, &intval);
403 new_limits[index] = intval;
404 *enables |= BIT(index);
/*
 * Walk the registered device list looking for a device whose name equals
 * @name. The caller must hold rdmacg_mutex.
 *
 * NOTE(review): the return statements are missing from this extract;
 * presumably the matching device is returned, or NULL when there is none --
 * confirm.
 */
412 static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
414 struct rdmacg_device *device;
416 lockdep_assert_held(&rdmacg_mutex);
418 list_for_each_entry(device, &rdmacg_devices, dev_node)
419 if (!strcmp(name, device->name))
/*
 * Write handler of the limit file. As the parsing below shows, the input is
 * a device name followed by space-separated resource options; the parsed
 * limits are applied to the pool of (this cgroup, that device).
 *
 * NOTE(review): error checks, the error labels and the release of
 * new_limits are missing from this extract -- confirm against the full
 * source.
 */
425 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
426 char *buf, size_t nbytes, loff_t off)
428 struct rdma_cgroup *cg = css_rdmacg(of_css(of));
429 const char *dev_name;
430 struct rdmacg_resource_pool *rpool;
431 struct rdmacg_device *device;
432 char *options = strstrip(buf);
434 unsigned long enables = 0;
437 /* extract the device name first */
438 dev_name = strsep(&options, " ");
444 new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
450 ret = rdmacg_parse_limits(options, new_limits, &enables);
454 /* acquire lock to synchronize with hot plug devices */
455 mutex_lock(&rdmacg_mutex);
457 device = rdmacg_get_device_locked(dev_name);
463 rpool = get_cg_rpool_locked(cg, device);
465 ret = PTR_ERR(rpool);
469 /* now set the new limits of the rpool */
/* only resources whose bit is set in enables are updated */
470 for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
471 set_resource_limit(rpool, i, new_limits[i]);
473 if (rpool->usage_sum == 0 &&
474 rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
476 * No user of the rpool and all entries are set to max, so
477 * safe to delete this rpool.
479 free_cg_rpool_locked(rpool);
483 mutex_unlock(&rdmacg_mutex);
/* yields ret when it is non-zero, otherwise the byte count passed in */
489 return ret ?: nbytes;
/*
 * Print every resource name followed by a value taken from @rpool. The file
 * type stored in the cftype .private field selects the value; a value equal
 * to S32_MAX is printed as RDMACG_MAX_STR, any other value as a decimal.
 *
 * NOTE(review): local declarations, separators and the branches that pick
 * between limit and usage are partly missing from this extract; whether a
 * NULL @rpool is handled cannot be seen here -- confirm.
 */
492 static void print_rpool_values(struct seq_file *sf,
493 struct rdmacg_resource_pool *rpool)
495 enum rdmacg_file_type sf_type;
499 sf_type = seq_cft(sf)->private;
501 for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
502 seq_puts(sf, rdmacg_resource_names[i]);
504 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
506 value = rpool->resources[i].max;
511 value = rpool->resources[i].usage;
516 if (value == S32_MAX)
517 seq_puts(sf, RDMACG_MAX_STR);
519 seq_printf(sf, "%d", value);
/*
 * seq_show handler shared by both cgroup files: under rdmacg_mutex, print
 * the name of every registered device followed by this cgroup's values for
 * it.
 *
 * NOTE(review): the line terminator and the return statement are missing
 * from this extract.
 */
524 static int rdmacg_resource_read(struct seq_file *sf, void *v)
526 struct rdmacg_device *device;
527 struct rdmacg_resource_pool *rpool;
528 struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
530 mutex_lock(&rdmacg_mutex);
532 list_for_each_entry(device, &rdmacg_devices, dev_node) {
533 seq_printf(sf, "%s ", device->name);
/*
 * NOTE(review): the lookup result is passed on unchecked; confirm that
 * print_rpool_values() copes with a cgroup that has no pool for the device.
 */
535 rpool = find_cg_rpool_locked(cg, device);
536 print_rpool_values(sf, rpool);
541 mutex_unlock(&rdmacg_mutex);
/*
 * Control files of the controller. Both entries share the same read handler
 * and are flagged CFTYPE_NOT_ON_ROOT; .private tells the handler which value
 * to report. Only the first entry has a write handler.
 *
 * NOTE(review): the .name initializers and the entry delimiters are missing
 * from this extract.
 */
545 static struct cftype rdmacg_files[] = {
548 .write = rdmacg_resource_set_max,
549 .seq_show = rdmacg_resource_read,
550 .private = RDMACG_RESOURCE_TYPE_MAX,
551 .flags = CFTYPE_NOT_ON_ROOT,
555 .seq_show = rdmacg_resource_read,
556 .private = RDMACG_RESOURCE_TYPE_STAT,
557 .flags = CFTYPE_NOT_ON_ROOT,
/*
 * css_alloc callback: allocate a zeroed rdma_cgroup and initialise its
 * (empty) resource pool list. ERR_PTR(-ENOMEM) is returned on the
 * allocation-failure path.
 *
 * NOTE(review): the allocation check and the success-path return are missing
 * from this extract.
 */
562 static struct cgroup_subsys_state *
563 rdmacg_css_alloc(struct cgroup_subsys_state *parent)
565 struct rdma_cgroup *cg;
567 cg = kzalloc(sizeof(*cg), GFP_KERNEL);
569 return ERR_PTR(-ENOMEM);
571 INIT_LIST_HEAD(&cg->rpools);
/*
 * css_free callback.
 *
 * NOTE(review): only the conversion from @css to its rdma_cgroup is visible
 * in this extract; the statement that releases the rdma_cgroup is missing --
 * confirm against the full source.
 */
575 static void rdmacg_css_free(struct cgroup_subsys_state *css)
577 struct rdma_cgroup *cg = css_rdmacg(css);
583 * rdmacg_css_offline - cgroup css_offline callback
584 * @css: css of interest
586 * This function is called when @css is about to go away and responsible
587 * for shooting down all rdmacg associated with @css. As part of that it
588 * marks all the resource pool entries to max value, so that when resources are
589 * uncharged, associated resource pool can be freed as well.
591 static void rdmacg_css_offline(struct cgroup_subsys_state *css)
593 struct rdma_cgroup *cg = css_rdmacg(css);
594 struct rdmacg_resource_pool *rpool;
596 mutex_lock(&rdmacg_mutex);
598 list_for_each_entry(rpool, &cg->rpools, cg_node)
599 set_all_resource_max_limit(rpool);
601 mutex_unlock(&rdmacg_mutex);
604 struct cgroup_subsys rdma_cgrp_subsys = {
605 .css_alloc = rdmacg_css_alloc,
606 .css_free = rdmacg_css_free,
607 .css_offline = rdmacg_css_offline,
608 .legacy_cftypes = rdmacg_files,
609 .dfl_cftypes = rdmacg_files,