~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

TOMOYO Linux Cross Reference
Linux/kernel/cgroup/rdma.c

Version: ~ [ linux-5.2-rc6 ] ~ [ linux-5.1.15 ] ~ [ linux-5.0.21 ] ~ [ linux-4.20.17 ] ~ [ linux-4.19.56 ] ~ [ linux-4.18.20 ] ~ [ linux-4.17.19 ] ~ [ linux-4.16.18 ] ~ [ linux-4.15.18 ] ~ [ linux-4.14.130 ] ~ [ linux-4.13.16 ] ~ [ linux-4.12.14 ] ~ [ linux-4.11.12 ] ~ [ linux-4.10.17 ] ~ [ linux-4.9.183 ] ~ [ linux-4.8.17 ] ~ [ linux-4.7.10 ] ~ [ linux-4.6.7 ] ~ [ linux-4.5.7 ] ~ [ linux-4.4.183 ] ~ [ linux-4.3.6 ] ~ [ linux-4.2.8 ] ~ [ linux-4.1.52 ] ~ [ linux-4.0.9 ] ~ [ linux-3.19.8 ] ~ [ linux-3.18.140 ] ~ [ linux-3.17.8 ] ~ [ linux-3.16.69 ] ~ [ linux-3.15.10 ] ~ [ linux-3.14.79 ] ~ [ linux-3.13.11 ] ~ [ linux-3.12.74 ] ~ [ linux-3.11.10 ] ~ [ linux-3.10.108 ] ~ [ linux-3.9.11 ] ~ [ linux-3.8.13 ] ~ [ linux-3.7.10 ] ~ [ linux-3.6.11 ] ~ [ linux-3.5.7 ] ~ [ linux-3.4.113 ] ~ [ linux-3.3.8 ] ~ [ linux-3.2.102 ] ~ [ linux-3.1.10 ] ~ [ linux-3.0.101 ] ~ [ linux-2.6.39.4 ] ~ [ linux-2.6.38.8 ] ~ [ linux-2.6.37.6 ] ~ [ linux-2.6.36.4 ] ~ [ linux-2.6.35.14 ] ~ [ linux-2.6.34.15 ] ~ [ linux-2.6.33.20 ] ~ [ linux-2.6.32.71 ] ~ [ linux-2.6.0 ] ~ [ linux-2.4.37.11 ] ~ [ unix-v6-master ] ~ [ ccs-tools-1.8.5 ] ~ [ policy-sample ] ~
Architecture: ~ [ i386 ] ~ [ alpha ] ~ [ m68k ] ~ [ mips ] ~ [ ppc ] ~ [ sparc ] ~ [ sparc64 ] ~

  1 /*
  2  * RDMA resource limiting controller for cgroups.
  3  *
  4  * Used to allow a cgroup hierarchy to stop processes from consuming
  5  * additional RDMA resources after a certain limit is reached.
  6  *
  7  * Copyright (C) 2016 Parav Pandit <pandit.parav@gmail.com>
  8  *
  9  * This file is subject to the terms and conditions of version 2 of the GNU
 10  * General Public License. See the file COPYING in the main directory of the
 11  * Linux distribution for more details.
 12  */
 13 
 14 #include <linux/bitops.h>
 15 #include <linux/slab.h>
 16 #include <linux/seq_file.h>
 17 #include <linux/cgroup.h>
 18 #include <linux/parser.h>
 19 #include <linux/cgroup_rdma.h>
 20 
 21 #define RDMACG_MAX_STR "max"
 22 
 23 /*
 24  * Protects list of resource pools maintained on per cgroup basis
 25  * and rdma device list.
 26  */
 27 static DEFINE_MUTEX(rdmacg_mutex);
 28 static LIST_HEAD(rdmacg_devices);
 29 
 30 enum rdmacg_file_type {
 31         RDMACG_RESOURCE_TYPE_MAX,
 32         RDMACG_RESOURCE_TYPE_STAT,
 33 };
 34 
 35 /*
 36  * resource table definition as to be seen by the user.
 37  * Need to add entries to it when more resources are
 38  * added/defined at IB verb/core layer.
 39  */
 40 static char const *rdmacg_resource_names[] = {
 41         [RDMACG_RESOURCE_HCA_HANDLE]    = "hca_handle",
 42         [RDMACG_RESOURCE_HCA_OBJECT]    = "hca_object",
 43 };
 44 
 45 /* resource tracker for each resource of rdma cgroup */
 46 struct rdmacg_resource {
 47         int max;
 48         int usage;
 49 };
 50 
 51 /*
 52  * resource pool object which represents per cgroup, per device
 53  * resources. There are multiple instances of this object per cgroup,
 54  * therefore it cannot be embedded within rdma_cgroup structure. It
 55  * is maintained as list.
 56  */
 57 struct rdmacg_resource_pool {
 58         struct rdmacg_device    *device;
 59         struct rdmacg_resource  resources[RDMACG_RESOURCE_MAX];
 60 
 61         struct list_head        cg_node;
 62         struct list_head        dev_node;
 63 
 64         /* count active user tasks of this pool */
 65         u64                     usage_sum;
 66         /* total number counts which are set to max */
 67         int                     num_max_cnt;
 68 };
 69 
 70 static struct rdma_cgroup *css_rdmacg(struct cgroup_subsys_state *css)
 71 {
 72         return container_of(css, struct rdma_cgroup, css);
 73 }
 74 
 75 static struct rdma_cgroup *parent_rdmacg(struct rdma_cgroup *cg)
 76 {
 77         return css_rdmacg(cg->css.parent);
 78 }
 79 
 80 static inline struct rdma_cgroup *get_current_rdmacg(void)
 81 {
 82         return css_rdmacg(task_get_css(current, rdma_cgrp_id));
 83 }
 84 
 85 static void set_resource_limit(struct rdmacg_resource_pool *rpool,
 86                                int index, int new_max)
 87 {
 88         if (new_max == S32_MAX) {
 89                 if (rpool->resources[index].max != S32_MAX)
 90                         rpool->num_max_cnt++;
 91         } else {
 92                 if (rpool->resources[index].max == S32_MAX)
 93                         rpool->num_max_cnt--;
 94         }
 95         rpool->resources[index].max = new_max;
 96 }
 97 
 98 static void set_all_resource_max_limit(struct rdmacg_resource_pool *rpool)
 99 {
100         int i;
101 
102         for (i = 0; i < RDMACG_RESOURCE_MAX; i++)
103                 set_resource_limit(rpool, i, S32_MAX);
104 }
105 
106 static void free_cg_rpool_locked(struct rdmacg_resource_pool *rpool)
107 {
108         lockdep_assert_held(&rdmacg_mutex);
109 
110         list_del(&rpool->cg_node);
111         list_del(&rpool->dev_node);
112         kfree(rpool);
113 }
114 
115 static struct rdmacg_resource_pool *
116 find_cg_rpool_locked(struct rdma_cgroup *cg,
117                      struct rdmacg_device *device)
118 
119 {
120         struct rdmacg_resource_pool *pool;
121 
122         lockdep_assert_held(&rdmacg_mutex);
123 
124         list_for_each_entry(pool, &cg->rpools, cg_node)
125                 if (pool->device == device)
126                         return pool;
127 
128         return NULL;
129 }
130 
131 static struct rdmacg_resource_pool *
132 get_cg_rpool_locked(struct rdma_cgroup *cg, struct rdmacg_device *device)
133 {
134         struct rdmacg_resource_pool *rpool;
135 
136         rpool = find_cg_rpool_locked(cg, device);
137         if (rpool)
138                 return rpool;
139 
140         rpool = kzalloc(sizeof(*rpool), GFP_KERNEL);
141         if (!rpool)
142                 return ERR_PTR(-ENOMEM);
143 
144         rpool->device = device;
145         set_all_resource_max_limit(rpool);
146 
147         INIT_LIST_HEAD(&rpool->cg_node);
148         INIT_LIST_HEAD(&rpool->dev_node);
149         list_add_tail(&rpool->cg_node, &cg->rpools);
150         list_add_tail(&rpool->dev_node, &device->rpools);
151         return rpool;
152 }
153 
154 /**
155  * uncharge_cg_locked - uncharge resource for rdma cgroup
156  * @cg: pointer to cg to uncharge and all parents in hierarchy
157  * @device: pointer to rdmacg device
158  * @index: index of the resource to uncharge in cg (resource pool)
159  *
160  * It also frees the resource pool which was created as part of
161  * charging operation when there are no resources attached to
162  * resource pool.
163  */
164 static void
165 uncharge_cg_locked(struct rdma_cgroup *cg,
166                    struct rdmacg_device *device,
167                    enum rdmacg_resource_type index)
168 {
169         struct rdmacg_resource_pool *rpool;
170 
171         rpool = find_cg_rpool_locked(cg, device);
172 
173         /*
174          * rpool cannot be null at this stage. Let kernel operate in case
175          * if there a bug in IB stack or rdma controller, instead of crashing
176          * the system.
177          */
178         if (unlikely(!rpool)) {
179                 pr_warn("Invalid device %p or rdma cgroup %p\n", cg, device);
180                 return;
181         }
182 
183         rpool->resources[index].usage--;
184 
185         /*
186          * A negative count (or overflow) is invalid,
187          * it indicates a bug in the rdma controller.
188          */
189         WARN_ON_ONCE(rpool->resources[index].usage < 0);
190         rpool->usage_sum--;
191         if (rpool->usage_sum == 0 &&
192             rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
193                 /*
194                  * No user of the rpool and all entries are set to max, so
195                  * safe to delete this rpool.
196                  */
197                 free_cg_rpool_locked(rpool);
198         }
199 }
200 
201 /**
202  * rdmacg_uncharge_hierarchy - hierarchically uncharge rdma resource count
203  * @device: pointer to rdmacg device
204  * @stop_cg: while traversing hirerchy, when meet with stop_cg cgroup
205  *           stop uncharging
206  * @index: index of the resource to uncharge in cg in given resource pool
207  */
208 static void rdmacg_uncharge_hierarchy(struct rdma_cgroup *cg,
209                                      struct rdmacg_device *device,
210                                      struct rdma_cgroup *stop_cg,
211                                      enum rdmacg_resource_type index)
212 {
213         struct rdma_cgroup *p;
214 
215         mutex_lock(&rdmacg_mutex);
216 
217         for (p = cg; p != stop_cg; p = parent_rdmacg(p))
218                 uncharge_cg_locked(p, device, index);
219 
220         mutex_unlock(&rdmacg_mutex);
221 
222         css_put(&cg->css);
223 }
224 
225 /**
226  * rdmacg_uncharge - hierarchically uncharge rdma resource count
227  * @device: pointer to rdmacg device
228  * @index: index of the resource to uncharge in cgroup in given resource pool
229  */
230 void rdmacg_uncharge(struct rdma_cgroup *cg,
231                      struct rdmacg_device *device,
232                      enum rdmacg_resource_type index)
233 {
234         if (index >= RDMACG_RESOURCE_MAX)
235                 return;
236 
237         rdmacg_uncharge_hierarchy(cg, device, NULL, index);
238 }
239 EXPORT_SYMBOL(rdmacg_uncharge);
240 
241 /**
242  * rdmacg_try_charge - hierarchically try to charge the rdma resource
243  * @rdmacg: pointer to rdma cgroup which will own this resource
244  * @device: pointer to rdmacg device
245  * @index: index of the resource to charge in cgroup (resource pool)
246  *
247  * This function follows charging resource in hierarchical way.
248  * It will fail if the charge would cause the new value to exceed the
249  * hierarchical limit.
250  * Returns 0 if the charge succeded, otherwise -EAGAIN, -ENOMEM or -EINVAL.
251  * Returns pointer to rdmacg for this resource when charging is successful.
252  *
253  * Charger needs to account resources on two criteria.
254  * (a) per cgroup & (b) per device resource usage.
255  * Per cgroup resource usage ensures that tasks of cgroup doesn't cross
256  * the configured limits. Per device provides granular configuration
257  * in multi device usage. It allocates resource pool in the hierarchy
258  * for each parent it come across for first resource. Later on resource
259  * pool will be available. Therefore it will be much faster thereon
260  * to charge/uncharge.
261  */
262 int rdmacg_try_charge(struct rdma_cgroup **rdmacg,
263                       struct rdmacg_device *device,
264                       enum rdmacg_resource_type index)
265 {
266         struct rdma_cgroup *cg, *p;
267         struct rdmacg_resource_pool *rpool;
268         s64 new;
269         int ret = 0;
270 
271         if (index >= RDMACG_RESOURCE_MAX)
272                 return -EINVAL;
273 
274         /*
275          * hold on to css, as cgroup can be removed but resource
276          * accounting happens on css.
277          */
278         cg = get_current_rdmacg();
279 
280         mutex_lock(&rdmacg_mutex);
281         for (p = cg; p; p = parent_rdmacg(p)) {
282                 rpool = get_cg_rpool_locked(p, device);
283                 if (IS_ERR(rpool)) {
284                         ret = PTR_ERR(rpool);
285                         goto err;
286                 } else {
287                         new = rpool->resources[index].usage + 1;
288                         if (new > rpool->resources[index].max) {
289                                 ret = -EAGAIN;
290                                 goto err;
291                         } else {
292                                 rpool->resources[index].usage = new;
293                                 rpool->usage_sum++;
294                         }
295                 }
296         }
297         mutex_unlock(&rdmacg_mutex);
298 
299         *rdmacg = cg;
300         return 0;
301 
302 err:
303         mutex_unlock(&rdmacg_mutex);
304         rdmacg_uncharge_hierarchy(cg, device, p, index);
305         return ret;
306 }
307 EXPORT_SYMBOL(rdmacg_try_charge);
308 
309 /**
310  * rdmacg_register_device - register rdmacg device to rdma controller.
311  * @device: pointer to rdmacg device whose resources need to be accounted.
312  *
313  * If IB stack wish a device to participate in rdma cgroup resource
314  * tracking, it must invoke this API to register with rdma cgroup before
315  * any user space application can start using the RDMA resources.
316  * Returns 0 on success or EINVAL when table length given is beyond
317  * supported size.
318  */
319 int rdmacg_register_device(struct rdmacg_device *device)
320 {
321         INIT_LIST_HEAD(&device->dev_node);
322         INIT_LIST_HEAD(&device->rpools);
323 
324         mutex_lock(&rdmacg_mutex);
325         list_add_tail(&device->dev_node, &rdmacg_devices);
326         mutex_unlock(&rdmacg_mutex);
327         return 0;
328 }
329 EXPORT_SYMBOL(rdmacg_register_device);
330 
331 /**
332  * rdmacg_unregister_device - unregister rdmacg device from rdma controller.
333  * @device: pointer to rdmacg device which was previously registered with rdma
334  *          controller using rdmacg_register_device().
335  *
336  * IB stack must invoke this after all the resources of the IB device
337  * are destroyed and after ensuring that no more resources will be created
338  * when this API is invoked.
339  */
340 void rdmacg_unregister_device(struct rdmacg_device *device)
341 {
342         struct rdmacg_resource_pool *rpool, *tmp;
343 
344         /*
345          * Synchronize with any active resource settings,
346          * usage query happening via configfs.
347          */
348         mutex_lock(&rdmacg_mutex);
349         list_del_init(&device->dev_node);
350 
351         /*
352          * Now that this device is off the cgroup list, its safe to free
353          * all the rpool resources.
354          */
355         list_for_each_entry_safe(rpool, tmp, &device->rpools, dev_node)
356                 free_cg_rpool_locked(rpool);
357 
358         mutex_unlock(&rdmacg_mutex);
359 }
360 EXPORT_SYMBOL(rdmacg_unregister_device);
361 
362 static int parse_resource(char *c, int *intval)
363 {
364         substring_t argstr;
365         const char **table = &rdmacg_resource_names[0];
366         char *name, *value = c;
367         size_t len;
368         int ret, i = 0;
369 
370         name = strsep(&value, "=");
371         if (!name || !value)
372                 return -EINVAL;
373 
374         len = strlen(value);
375 
376         for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
377                 if (strcmp(table[i], name))
378                         continue;
379 
380                 argstr.from = value;
381                 argstr.to = value + len;
382 
383                 ret = match_int(&argstr, intval);
384                 if (ret >= 0) {
385                         if (*intval < 0)
386                                 break;
387                         return i;
388                 }
389                 if (strncmp(value, RDMACG_MAX_STR, len) == 0) {
390                         *intval = S32_MAX;
391                         return i;
392                 }
393                 break;
394         }
395         return -EINVAL;
396 }
397 
398 static int rdmacg_parse_limits(char *options,
399                                int *new_limits, unsigned long *enables)
400 {
401         char *c;
402         int err = -EINVAL;
403 
404         /* parse resource options */
405         while ((c = strsep(&options, " ")) != NULL) {
406                 int index, intval;
407 
408                 index = parse_resource(c, &intval);
409                 if (index < 0)
410                         goto err;
411 
412                 new_limits[index] = intval;
413                 *enables |= BIT(index);
414         }
415         return 0;
416 
417 err:
418         return err;
419 }
420 
421 static struct rdmacg_device *rdmacg_get_device_locked(const char *name)
422 {
423         struct rdmacg_device *device;
424 
425         lockdep_assert_held(&rdmacg_mutex);
426 
427         list_for_each_entry(device, &rdmacg_devices, dev_node)
428                 if (!strcmp(name, device->name))
429                         return device;
430 
431         return NULL;
432 }
433 
434 static ssize_t rdmacg_resource_set_max(struct kernfs_open_file *of,
435                                        char *buf, size_t nbytes, loff_t off)
436 {
437         struct rdma_cgroup *cg = css_rdmacg(of_css(of));
438         const char *dev_name;
439         struct rdmacg_resource_pool *rpool;
440         struct rdmacg_device *device;
441         char *options = strstrip(buf);
442         int *new_limits;
443         unsigned long enables = 0;
444         int i = 0, ret = 0;
445 
446         /* extract the device name first */
447         dev_name = strsep(&options, " ");
448         if (!dev_name) {
449                 ret = -EINVAL;
450                 goto err;
451         }
452 
453         new_limits = kcalloc(RDMACG_RESOURCE_MAX, sizeof(int), GFP_KERNEL);
454         if (!new_limits) {
455                 ret = -ENOMEM;
456                 goto err;
457         }
458 
459         ret = rdmacg_parse_limits(options, new_limits, &enables);
460         if (ret)
461                 goto parse_err;
462 
463         /* acquire lock to synchronize with hot plug devices */
464         mutex_lock(&rdmacg_mutex);
465 
466         device = rdmacg_get_device_locked(dev_name);
467         if (!device) {
468                 ret = -ENODEV;
469                 goto dev_err;
470         }
471 
472         rpool = get_cg_rpool_locked(cg, device);
473         if (IS_ERR(rpool)) {
474                 ret = PTR_ERR(rpool);
475                 goto dev_err;
476         }
477 
478         /* now set the new limits of the rpool */
479         for_each_set_bit(i, &enables, RDMACG_RESOURCE_MAX)
480                 set_resource_limit(rpool, i, new_limits[i]);
481 
482         if (rpool->usage_sum == 0 &&
483             rpool->num_max_cnt == RDMACG_RESOURCE_MAX) {
484                 /*
485                  * No user of the rpool and all entries are set to max, so
486                  * safe to delete this rpool.
487                  */
488                 free_cg_rpool_locked(rpool);
489         }
490 
491 dev_err:
492         mutex_unlock(&rdmacg_mutex);
493 
494 parse_err:
495         kfree(new_limits);
496 
497 err:
498         return ret ?: nbytes;
499 }
500 
501 static void print_rpool_values(struct seq_file *sf,
502                                struct rdmacg_resource_pool *rpool)
503 {
504         enum rdmacg_file_type sf_type;
505         int i;
506         u32 value;
507 
508         sf_type = seq_cft(sf)->private;
509 
510         for (i = 0; i < RDMACG_RESOURCE_MAX; i++) {
511                 seq_puts(sf, rdmacg_resource_names[i]);
512                 seq_putc(sf, '=');
513                 if (sf_type == RDMACG_RESOURCE_TYPE_MAX) {
514                         if (rpool)
515                                 value = rpool->resources[i].max;
516                         else
517                                 value = S32_MAX;
518                 } else {
519                         if (rpool)
520                                 value = rpool->resources[i].usage;
521                         else
522                                 value = 0;
523                 }
524 
525                 if (value == S32_MAX)
526                         seq_puts(sf, RDMACG_MAX_STR);
527                 else
528                         seq_printf(sf, "%d", value);
529                 seq_putc(sf, ' ');
530         }
531 }
532 
533 static int rdmacg_resource_read(struct seq_file *sf, void *v)
534 {
535         struct rdmacg_device *device;
536         struct rdmacg_resource_pool *rpool;
537         struct rdma_cgroup *cg = css_rdmacg(seq_css(sf));
538 
539         mutex_lock(&rdmacg_mutex);
540 
541         list_for_each_entry(device, &rdmacg_devices, dev_node) {
542                 seq_printf(sf, "%s ", device->name);
543 
544                 rpool = find_cg_rpool_locked(cg, device);
545                 print_rpool_values(sf, rpool);
546 
547                 seq_putc(sf, '\n');
548         }
549 
550         mutex_unlock(&rdmacg_mutex);
551         return 0;
552 }
553 
554 static struct cftype rdmacg_files[] = {
555         {
556                 .name = "max",
557                 .write = rdmacg_resource_set_max,
558                 .seq_show = rdmacg_resource_read,
559                 .private = RDMACG_RESOURCE_TYPE_MAX,
560                 .flags = CFTYPE_NOT_ON_ROOT,
561         },
562         {
563                 .name = "current",
564                 .seq_show = rdmacg_resource_read,
565                 .private = RDMACG_RESOURCE_TYPE_STAT,
566                 .flags = CFTYPE_NOT_ON_ROOT,
567         },
568         { }     /* terminate */
569 };
570 
571 static struct cgroup_subsys_state *
572 rdmacg_css_alloc(struct cgroup_subsys_state *parent)
573 {
574         struct rdma_cgroup *cg;
575 
576         cg = kzalloc(sizeof(*cg), GFP_KERNEL);
577         if (!cg)
578                 return ERR_PTR(-ENOMEM);
579 
580         INIT_LIST_HEAD(&cg->rpools);
581         return &cg->css;
582 }
583 
/* css_free callback: release the rdma cgroup that embeds @css. */
static void rdmacg_css_free(struct cgroup_subsys_state *css)
{
	kfree(css_rdmacg(css));
}
590 
591 /**
592  * rdmacg_css_offline - cgroup css_offline callback
593  * @css: css of interest
594  *
595  * This function is called when @css is about to go away and responsible
596  * for shooting down all rdmacg associated with @css. As part of that it
597  * marks all the resource pool entries to max value, so that when resources are
598  * uncharged, associated resource pool can be freed as well.
599  */
600 static void rdmacg_css_offline(struct cgroup_subsys_state *css)
601 {
602         struct rdma_cgroup *cg = css_rdmacg(css);
603         struct rdmacg_resource_pool *rpool;
604 
605         mutex_lock(&rdmacg_mutex);
606 
607         list_for_each_entry(rpool, &cg->rpools, cg_node)
608                 set_all_resource_max_limit(rpool);
609 
610         mutex_unlock(&rdmacg_mutex);
611 }
612 
613 struct cgroup_subsys rdma_cgrp_subsys = {
614         .css_alloc      = rdmacg_css_alloc,
615         .css_free       = rdmacg_css_free,
616         .css_offline    = rdmacg_css_offline,
617         .legacy_cftypes = rdmacg_files,
618         .dfl_cftypes    = rdmacg_files,
619 };
620 

~ [ source navigation ] ~ [ diff markup ] ~ [ identifier search ] ~

kernel.org | git.kernel.org | LWN.net | Project Home | Wiki (Japanese) | Wiki (English) | SVN repository | Mail admin

Linux® is a registered trademark of Linus Torvalds in the United States and other countries.
TOMOYO® is a registered trademark of NTT DATA CORPORATION.

osdn.jp