
TOMOYO Linux Cross Reference
Linux/mm/memcontrol.c


  1 /* memcontrol.c - Memory Controller
  2  *
  3  * Copyright IBM Corporation, 2007
  4  * Author Balbir Singh <balbir@linux.vnet.ibm.com>
  5  *
  6  * Copyright 2007 OpenVZ SWsoft Inc
  7  * Author: Pavel Emelianov <xemul@openvz.org>
  8  *
  9  * Memory thresholds
 10  * Copyright (C) 2009 Nokia Corporation
 11  * Author: Kirill A. Shutemov
 12  *
 13  * Kernel Memory Controller
 14  * Copyright (C) 2012 Parallels Inc. and Google Inc.
 15  * Authors: Glauber Costa and Suleiman Souhlal
 16  *
 17  * Native page reclaim
 18  * Charge lifetime sanitation
 19  * Lockless page tracking & accounting
 20  * Unified hierarchy configuration model
 21  * Copyright (C) 2015 Red Hat, Inc., Johannes Weiner
 22  *
 23  * This program is free software; you can redistribute it and/or modify
 24  * it under the terms of the GNU General Public License as published by
 25  * the Free Software Foundation; either version 2 of the License, or
 26  * (at your option) any later version.
 27  *
 28  * This program is distributed in the hope that it will be useful,
 29  * but WITHOUT ANY WARRANTY; without even the implied warranty of
 30  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 31  * GNU General Public License for more details.
 32  */
 33 
 34 #include <linux/page_counter.h>
 35 #include <linux/memcontrol.h>
 36 #include <linux/cgroup.h>
 37 #include <linux/mm.h>
 38 #include <linux/sched/mm.h>
 39 #include <linux/shmem_fs.h>
 40 #include <linux/hugetlb.h>
 41 #include <linux/pagemap.h>
 42 #include <linux/smp.h>
 43 #include <linux/page-flags.h>
 44 #include <linux/backing-dev.h>
 45 #include <linux/bit_spinlock.h>
 46 #include <linux/rcupdate.h>
 47 #include <linux/limits.h>
 48 #include <linux/export.h>
 49 #include <linux/mutex.h>
 50 #include <linux/rbtree.h>
 51 #include <linux/slab.h>
 52 #include <linux/swap.h>
 53 #include <linux/swapops.h>
 54 #include <linux/spinlock.h>
 55 #include <linux/eventfd.h>
 56 #include <linux/poll.h>
 57 #include <linux/sort.h>
 58 #include <linux/fs.h>
 59 #include <linux/seq_file.h>
 60 #include <linux/vmpressure.h>
 61 #include <linux/mm_inline.h>
 62 #include <linux/swap_cgroup.h>
 63 #include <linux/cpu.h>
 64 #include <linux/oom.h>
 65 #include <linux/lockdep.h>
 66 #include <linux/file.h>
 67 #include <linux/tracehook.h>
 68 #include "internal.h"
 69 #include <net/sock.h>
 70 #include <net/ip.h>
 71 #include "slab.h"
 72 
 73 #include <linux/uaccess.h>
 74 
 75 #include <trace/events/vmscan.h>
 76 
 77 struct cgroup_subsys memory_cgrp_subsys __read_mostly;
 78 EXPORT_SYMBOL(memory_cgrp_subsys);
 79 
 80 struct mem_cgroup *root_mem_cgroup __read_mostly;
 81 
 82 #define MEM_CGROUP_RECLAIM_RETRIES      5
 83 
 84 /* Socket memory accounting disabled? */
 85 static bool cgroup_memory_nosocket;
 86 
 87 /* Kernel memory accounting disabled? */
 88 static bool cgroup_memory_nokmem;
 89 
 90 /* Whether the swap controller is active */
 91 #ifdef CONFIG_MEMCG_SWAP
 92 int do_swap_account __read_mostly;
 93 #else
 94 #define do_swap_account         0
 95 #endif
 96 
 97 /* Whether legacy memory+swap accounting is active */
 98 static bool do_memsw_account(void)
 99 {
100         return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && do_swap_account;
101 }
102 
103 static const char *const mem_cgroup_lru_names[] = {
104         "inactive_anon",
105         "active_anon",
106         "inactive_file",
107         "active_file",
108         "unevictable",
109 };
110 
111 #define THRESHOLDS_EVENTS_TARGET 128
112 #define SOFTLIMIT_EVENTS_TARGET 1024
113 #define NUMAINFO_EVENTS_TARGET  1024
114 
115 /*
 116  * Cgroups above their limits are maintained in an RB-tree, independent of
 117  * their hierarchy representation.
118  */
119 
120 struct mem_cgroup_tree_per_node {
121         struct rb_root rb_root;
122         struct rb_node *rb_rightmost;
123         spinlock_t lock;
124 };
125 
126 struct mem_cgroup_tree {
127         struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
128 };
129 
130 static struct mem_cgroup_tree soft_limit_tree __read_mostly;
131 
132 /* for OOM */
133 struct mem_cgroup_eventfd_list {
134         struct list_head list;
135         struct eventfd_ctx *eventfd;
136 };
137 
138 /*
 139  * cgroup_event represents events which userspace wants to receive.
140  */
141 struct mem_cgroup_event {
142         /*
143          * memcg which the event belongs to.
144          */
145         struct mem_cgroup *memcg;
146         /*
147          * eventfd to signal userspace about the event.
148          */
149         struct eventfd_ctx *eventfd;
150         /*
 151          * Each of these is stored in a list by the cgroup.
152          */
153         struct list_head list;
154         /*
 155          * register_event() callback will be used to add a new userspace
 156          * waiter for changes related to this event.  Use eventfd_signal()
157          * on eventfd to send notification to userspace.
158          */
159         int (*register_event)(struct mem_cgroup *memcg,
160                               struct eventfd_ctx *eventfd, const char *args);
161         /*
162          * unregister_event() callback will be called when userspace closes
 163          * the eventfd or when the cgroup is removed.  This callback must be
 164          * set if you want to provide notification functionality.
165          */
166         void (*unregister_event)(struct mem_cgroup *memcg,
167                                  struct eventfd_ctx *eventfd);
168         /*
 169          * All fields below are needed to unregister the event when
 170          * userspace closes the eventfd.
171          */
172         poll_table pt;
173         wait_queue_head_t *wqh;
174         wait_queue_entry_t wait;
175         struct work_struct remove;
176 };
177 
178 static void mem_cgroup_threshold(struct mem_cgroup *memcg);
179 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);
180 
 181 /* Stuff for moving charges at task migration. */
182 /*
183  * Types of charges to be moved.
184  */
185 #define MOVE_ANON       0x1U
186 #define MOVE_FILE       0x2U
187 #define MOVE_MASK       (MOVE_ANON | MOVE_FILE)
188 
189 /* "mc" and its members are protected by cgroup_mutex */
190 static struct move_charge_struct {
191         spinlock_t        lock; /* for from, to */
192         struct mm_struct  *mm;
193         struct mem_cgroup *from;
194         struct mem_cgroup *to;
195         unsigned long flags;
196         unsigned long precharge;
197         unsigned long moved_charge;
198         unsigned long moved_swap;
199         struct task_struct *moving_task;        /* a task moving charges */
200         wait_queue_head_t waitq;                /* a waitq for other context */
201 } mc = {
202         .lock = __SPIN_LOCK_UNLOCKED(mc.lock),
203         .waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
204 };
205 
206 /*
 207  * Maximum loops in mem_cgroup_soft_reclaim(), used for soft
 208  * limit reclaim to prevent infinite loops, if they ever occur.
209  */
210 #define MEM_CGROUP_MAX_RECLAIM_LOOPS            100
211 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2
212 
213 enum charge_type {
214         MEM_CGROUP_CHARGE_TYPE_CACHE = 0,
215         MEM_CGROUP_CHARGE_TYPE_ANON,
216         MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */
217         MEM_CGROUP_CHARGE_TYPE_DROP,    /* a page was unused swap cache */
218         NR_CHARGE_TYPE,
219 };
220 
221 /* for encoding cft->private value on file */
222 enum res_type {
223         _MEM,
224         _MEMSWAP,
225         _OOM_TYPE,
226         _KMEM,
227         _TCP,
228 };
229 
230 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val))
231 #define MEMFILE_TYPE(val)       ((val) >> 16 & 0xffff)
232 #define MEMFILE_ATTR(val)       ((val) & 0xffff)
 233 /* Used for OOM notifier */
234 #define OOM_CONTROL             (0)
235 
236 /*
237  * Iteration constructs for visiting all cgroups (under a tree).  If
238  * loops are exited prematurely (break), mem_cgroup_iter_break() must
239  * be used for reference counting.
240  */
241 #define for_each_mem_cgroup_tree(iter, root)            \
242         for (iter = mem_cgroup_iter(root, NULL, NULL);  \
243              iter != NULL;                              \
244              iter = mem_cgroup_iter(root, iter, NULL))
245 
246 #define for_each_mem_cgroup(iter)                       \
247         for (iter = mem_cgroup_iter(NULL, NULL, NULL);  \
248              iter != NULL;                              \
249              iter = mem_cgroup_iter(NULL, iter, NULL))
250 
251 /* Some nice accessors for the vmpressure. */
252 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg)
253 {
254         if (!memcg)
255                 memcg = root_mem_cgroup;
256         return &memcg->vmpressure;
257 }
258 
259 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr)
260 {
261         return &container_of(vmpr, struct mem_cgroup, vmpressure)->css;
262 }
263 
264 #ifdef CONFIG_MEMCG_KMEM
265 /*
266  * This will be the memcg's index in each cache's ->memcg_params.memcg_caches.
 267  * The main reason for not using the cgroup id for this is that it works
 268  * better in sparse environments, where we have a lot of memcgs but only
 269  * a few of them are kmem-limited. If we had, for instance, 200 memcgs
 270  * and only the 200th were kmem-limited, we would need a 200-entry array
 271  * for that.
272  *
273  * The current size of the caches array is stored in memcg_nr_cache_ids. It
274  * will double each time we have to increase it.
275  */
276 static DEFINE_IDA(memcg_cache_ida);
277 int memcg_nr_cache_ids;
278 
279 /* Protects memcg_nr_cache_ids */
280 static DECLARE_RWSEM(memcg_cache_ids_sem);
281 
282 void memcg_get_cache_ids(void)
283 {
284         down_read(&memcg_cache_ids_sem);
285 }
286 
287 void memcg_put_cache_ids(void)
288 {
289         up_read(&memcg_cache_ids_sem);
290 }
291 
292 /*
 293  * MIN_SIZE is different from 1, because we would like to avoid going through
 294  * the alloc/free process all the time. In a small machine, 4 kmem-limited
 295  * cgroups is a reasonable guess. In the future, it could be a parameter or
 296  * tunable, but that is not strictly necessary.
297  *
298  * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get
299  * this constant directly from cgroup, but it is understandable that this is
300  * better kept as an internal representation in cgroup.c. In any case, the
301  * cgrp_id space is not getting any smaller, and we don't have to necessarily
302  * increase ours as well if it increases.
303  */
304 #define MEMCG_CACHES_MIN_SIZE 4
305 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX
306 
307 /*
308  * A lot of the calls to the cache allocation functions are expected to be
309  * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 310  * conditional on this static branch, we have to allow modules that do
 311  * kmem_cache_alloc and the like to see this symbol as well.
312  */
313 DEFINE_STATIC_KEY_FALSE(memcg_kmem_enabled_key);
314 EXPORT_SYMBOL(memcg_kmem_enabled_key);
315 
316 struct workqueue_struct *memcg_kmem_cache_wq;
317 
318 static int memcg_shrinker_map_size;
319 static DEFINE_MUTEX(memcg_shrinker_map_mutex);
320 
321 static void memcg_free_shrinker_map_rcu(struct rcu_head *head)
322 {
323         kvfree(container_of(head, struct memcg_shrinker_map, rcu));
324 }
325 
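     /*
      * Replace each node's shrinker map of @memcg with a larger one of @size
      * bytes: all bits covering the old size are set (so no shrinker is
      * missed), the newly added bits are cleared, and the old map is freed
      * after an RCU grace period.  Must be called with
      * memcg_shrinker_map_mutex held.
      */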
326 static int memcg_expand_one_shrinker_map(struct mem_cgroup *memcg,
327                                          int size, int old_size)
328 {
329         struct memcg_shrinker_map *new, *old;
330         int nid;
331 
332         lockdep_assert_held(&memcg_shrinker_map_mutex);
333 
334         for_each_node(nid) {
335                 old = rcu_dereference_protected(
336                         mem_cgroup_nodeinfo(memcg, nid)->shrinker_map, true);
337                 /* Not yet online memcg */
338                 if (!old)
339                         return 0;
340 
341                 new = kvmalloc(sizeof(*new) + size, GFP_KERNEL);
342                 if (!new)
343                         return -ENOMEM;
344 
345                 /* Set all old bits, clear all new bits */
346                 memset(new->map, (int)0xff, old_size);
347                 memset((void *)new->map + old_size, 0, size - old_size);
348 
349                 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, new);
350                 call_rcu(&old->rcu, memcg_free_shrinker_map_rcu);
351         }
352 
353         return 0;
354 }
355 
356 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg)
357 {
358         struct mem_cgroup_per_node *pn;
359         struct memcg_shrinker_map *map;
360         int nid;
361 
362         if (mem_cgroup_is_root(memcg))
363                 return;
364 
365         for_each_node(nid) {
366                 pn = mem_cgroup_nodeinfo(memcg, nid);
367                 map = rcu_dereference_protected(pn->shrinker_map, true);
368                 if (map)
369                         kvfree(map);
370                 rcu_assign_pointer(pn->shrinker_map, NULL);
371         }
372 }
373 
374 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
375 {
376         struct memcg_shrinker_map *map;
377         int nid, size, ret = 0;
378 
379         if (mem_cgroup_is_root(memcg))
380                 return 0;
381 
382         mutex_lock(&memcg_shrinker_map_mutex);
383         size = memcg_shrinker_map_size;
384         for_each_node(nid) {
385                 map = kvzalloc(sizeof(*map) + size, GFP_KERNEL);
386                 if (!map) {
387                         memcg_free_shrinker_maps(memcg);
388                         ret = -ENOMEM;
389                         break;
390                 }
391                 rcu_assign_pointer(memcg->nodeinfo[nid]->shrinker_map, map);
392         }
393         mutex_unlock(&memcg_shrinker_map_mutex);
394 
395         return ret;
396 }
397 
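     /*
      * Make sure the shrinker maps of all memcgs are large enough to hold
      * bit @new_id.  On success, memcg_shrinker_map_size is updated so that
      * maps allocated afterwards are already big enough.
      */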
398 int memcg_expand_shrinker_maps(int new_id)
399 {
400         int size, old_size, ret = 0;
401         struct mem_cgroup *memcg;
402 
403         size = DIV_ROUND_UP(new_id + 1, BITS_PER_LONG) * sizeof(unsigned long);
404         old_size = memcg_shrinker_map_size;
405         if (size <= old_size)
406                 return 0;
407 
408         mutex_lock(&memcg_shrinker_map_mutex);
409         if (!root_mem_cgroup)
410                 goto unlock;
411 
412         for_each_mem_cgroup(memcg) {
413                 if (mem_cgroup_is_root(memcg))
414                         continue;
415                 ret = memcg_expand_one_shrinker_map(memcg, size, old_size);
416                 if (ret)
417                         goto unlock;
418         }
419 unlock:
420         if (!ret)
421                 memcg_shrinker_map_size = size;
422         mutex_unlock(&memcg_shrinker_map_mutex);
423         return ret;
424 }
425 
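     /*
      * Note that @memcg may have freeable objects for @shrinker_id on node
      * @nid, so that shrink_slab() will consult this shrinker for the memcg.
      */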
426 void memcg_set_shrinker_bit(struct mem_cgroup *memcg, int nid, int shrinker_id)
427 {
428         if (shrinker_id >= 0 && memcg && !mem_cgroup_is_root(memcg)) {
429                 struct memcg_shrinker_map *map;
430 
431                 rcu_read_lock();
432                 map = rcu_dereference(memcg->nodeinfo[nid]->shrinker_map);
433                 /* Pairs with smp mb in shrink_slab() */
434                 smp_mb__before_atomic();
435                 set_bit(shrinker_id, map->map);
436                 rcu_read_unlock();
437         }
438 }
439 
440 #else /* CONFIG_MEMCG_KMEM */
441 static int memcg_alloc_shrinker_maps(struct mem_cgroup *memcg)
442 {
443         return 0;
444 }
445 static void memcg_free_shrinker_maps(struct mem_cgroup *memcg) { }
446 #endif /* CONFIG_MEMCG_KMEM */
447 
448 /**
449  * mem_cgroup_css_from_page - css of the memcg associated with a page
450  * @page: page of interest
451  *
452  * If memcg is bound to the default hierarchy, css of the memcg associated
453  * with @page is returned.  The returned css remains associated with @page
454  * until it is released.
455  *
456  * If memcg is bound to a traditional hierarchy, the css of root_mem_cgroup
457  * is returned.
458  */
459 struct cgroup_subsys_state *mem_cgroup_css_from_page(struct page *page)
460 {
461         struct mem_cgroup *memcg;
462 
463         memcg = page->mem_cgroup;
464 
465         if (!memcg || !cgroup_subsys_on_dfl(memory_cgrp_subsys))
466                 memcg = root_mem_cgroup;
467 
468         return &memcg->css;
469 }
470 
471 /**
472  * page_cgroup_ino - return inode number of the memcg a page is charged to
473  * @page: the page
474  *
475  * Look up the closest online ancestor of the memory cgroup @page is charged to
476  * and return its inode number or 0 if @page is not charged to any cgroup. It
477  * is safe to call this function without holding a reference to @page.
478  *
479  * Note, this function is inherently racy, because there is nothing to prevent
480  * the cgroup inode from getting torn down and potentially reallocated a moment
481  * after page_cgroup_ino() returns, so it only should be used by callers that
482  * do not care (such as procfs interfaces).
483  */
484 ino_t page_cgroup_ino(struct page *page)
485 {
486         struct mem_cgroup *memcg;
487         unsigned long ino = 0;
488 
489         rcu_read_lock();
490         memcg = READ_ONCE(page->mem_cgroup);
491         while (memcg && !(memcg->css.flags & CSS_ONLINE))
492                 memcg = parent_mem_cgroup(memcg);
493         if (memcg)
494                 ino = cgroup_ino(memcg->css.cgroup);
495         rcu_read_unlock();
496         return ino;
497 }
498 
499 static struct mem_cgroup_per_node *
500 mem_cgroup_page_nodeinfo(struct mem_cgroup *memcg, struct page *page)
501 {
502         int nid = page_to_nid(page);
503 
504         return memcg->nodeinfo[nid];
505 }
506 
507 static struct mem_cgroup_tree_per_node *
508 soft_limit_tree_node(int nid)
509 {
510         return soft_limit_tree.rb_tree_per_node[nid];
511 }
512 
513 static struct mem_cgroup_tree_per_node *
514 soft_limit_tree_from_page(struct page *page)
515 {
516         int nid = page_to_nid(page);
517 
518         return soft_limit_tree.rb_tree_per_node[nid];
519 }
520 
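     /*
      * Link @mz into the per-node soft-limit RB-tree, ordered by how far the
      * memcg exceeds its soft limit, and keep rb_rightmost pointing at the
      * biggest offender.  Called with mctz->lock held.
      */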
521 static void __mem_cgroup_insert_exceeded(struct mem_cgroup_per_node *mz,
522                                          struct mem_cgroup_tree_per_node *mctz,
523                                          unsigned long new_usage_in_excess)
524 {
525         struct rb_node **p = &mctz->rb_root.rb_node;
526         struct rb_node *parent = NULL;
527         struct mem_cgroup_per_node *mz_node;
528         bool rightmost = true;
529 
530         if (mz->on_tree)
531                 return;
532 
533         mz->usage_in_excess = new_usage_in_excess;
534         if (!mz->usage_in_excess)
535                 return;
536         while (*p) {
537                 parent = *p;
538                 mz_node = rb_entry(parent, struct mem_cgroup_per_node,
539                                         tree_node);
540                 if (mz->usage_in_excess < mz_node->usage_in_excess) {
541                         p = &(*p)->rb_left;
542                         rightmost = false;
543                 }
544 
545                 /*
546                  * We can't avoid mem cgroups that are over their soft
547                  * limit by the same amount
548                  */
549                 else if (mz->usage_in_excess >= mz_node->usage_in_excess)
550                         p = &(*p)->rb_right;
551         }
552 
553         if (rightmost)
554                 mctz->rb_rightmost = &mz->tree_node;
555 
556         rb_link_node(&mz->tree_node, parent, p);
557         rb_insert_color(&mz->tree_node, &mctz->rb_root);
558         mz->on_tree = true;
559 }
560 
561 static void __mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
562                                          struct mem_cgroup_tree_per_node *mctz)
563 {
564         if (!mz->on_tree)
565                 return;
566 
567         if (&mz->tree_node == mctz->rb_rightmost)
568                 mctz->rb_rightmost = rb_prev(&mz->tree_node);
569 
570         rb_erase(&mz->tree_node, &mctz->rb_root);
571         mz->on_tree = false;
572 }
573 
574 static void mem_cgroup_remove_exceeded(struct mem_cgroup_per_node *mz,
575                                        struct mem_cgroup_tree_per_node *mctz)
576 {
577         unsigned long flags;
578 
579         spin_lock_irqsave(&mctz->lock, flags);
580         __mem_cgroup_remove_exceeded(mz, mctz);
581         spin_unlock_irqrestore(&mctz->lock, flags);
582 }
583 
584 static unsigned long soft_limit_excess(struct mem_cgroup *memcg)
585 {
586         unsigned long nr_pages = page_counter_read(&memcg->memory);
587         unsigned long soft_limit = READ_ONCE(memcg->soft_limit);
588         unsigned long excess = 0;
589 
590         if (nr_pages > soft_limit)
591                 excess = nr_pages - soft_limit;
592 
593         return excess;
594 }
595 
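     /*
      * (Re)position @memcg and each of its ancestors in the soft-limit tree
      * of @page's node according to their current soft-limit excess.
      */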
596 static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
597 {
598         unsigned long excess;
599         struct mem_cgroup_per_node *mz;
600         struct mem_cgroup_tree_per_node *mctz;
601 
602         mctz = soft_limit_tree_from_page(page);
603         if (!mctz)
604                 return;
605         /*
 606          * Necessary to update all ancestors when hierarchy is used,
 607          * because their event counter is not touched.
608          */
609         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
610                 mz = mem_cgroup_page_nodeinfo(memcg, page);
611                 excess = soft_limit_excess(memcg);
612                 /*
613                  * We have to update the tree if mz is on RB-tree or
614                  * mem is over its softlimit.
615                  */
616                 if (excess || mz->on_tree) {
617                         unsigned long flags;
618 
619                         spin_lock_irqsave(&mctz->lock, flags);
620                         /* if on-tree, remove it */
621                         if (mz->on_tree)
622                                 __mem_cgroup_remove_exceeded(mz, mctz);
623                         /*
624                          * Insert again. mz->usage_in_excess will be updated.
625                          * If excess is 0, no tree ops.
626                          */
627                         __mem_cgroup_insert_exceeded(mz, mctz, excess);
628                         spin_unlock_irqrestore(&mctz->lock, flags);
629                 }
630         }
631 }
632 
633 static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
634 {
635         struct mem_cgroup_tree_per_node *mctz;
636         struct mem_cgroup_per_node *mz;
637         int nid;
638 
639         for_each_node(nid) {
640                 mz = mem_cgroup_nodeinfo(memcg, nid);
641                 mctz = soft_limit_tree_node(nid);
642                 if (mctz)
643                         mem_cgroup_remove_exceeded(mz, mctz);
644         }
645 }
646 
647 static struct mem_cgroup_per_node *
648 __mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
649 {
650         struct mem_cgroup_per_node *mz;
651 
652 retry:
653         mz = NULL;
654         if (!mctz->rb_rightmost)
655                 goto done;              /* Nothing to reclaim from */
656 
657         mz = rb_entry(mctz->rb_rightmost,
658                       struct mem_cgroup_per_node, tree_node);
659         /*
660          * Remove the node now but someone else can add it back,
 661          * we will add it back at the end of reclaim to its correct
662          * position in the tree.
663          */
664         __mem_cgroup_remove_exceeded(mz, mctz);
665         if (!soft_limit_excess(mz->memcg) ||
666             !css_tryget_online(&mz->memcg->css))
667                 goto retry;
668 done:
669         return mz;
670 }
671 
672 static struct mem_cgroup_per_node *
673 mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz)
674 {
675         struct mem_cgroup_per_node *mz;
676 
677         spin_lock_irq(&mctz->lock);
678         mz = __mem_cgroup_largest_soft_limit_node(mctz);
679         spin_unlock_irq(&mctz->lock);
680         return mz;
681 }
682 
683 static unsigned long memcg_sum_events(struct mem_cgroup *memcg,
684                                       int event)
685 {
686         return atomic_long_read(&memcg->events[event]);
687 }
688 
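     /*
      * Update the memcg's RSS/CACHE/SHMEM counters, the PGPGIN/PGPGOUT
      * events and the per-cpu page event counter for charging or uncharging
      * @nr_pages pages (negative @nr_pages means uncharge).
      */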
689 static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
690                                          struct page *page,
691                                          bool compound, int nr_pages)
692 {
693         /*
694          * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
695          * counted as CACHE even if it's on ANON LRU.
696          */
697         if (PageAnon(page))
698                 __mod_memcg_state(memcg, MEMCG_RSS, nr_pages);
699         else {
700                 __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages);
701                 if (PageSwapBacked(page))
702                         __mod_memcg_state(memcg, NR_SHMEM, nr_pages);
703         }
704 
705         if (compound) {
706                 VM_BUG_ON_PAGE(!PageTransHuge(page), page);
707                 __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages);
708         }
709 
710         /* pagein of a big page is an event. So, ignore page size */
711         if (nr_pages > 0)
712                 __count_memcg_events(memcg, PGPGIN, 1);
713         else {
714                 __count_memcg_events(memcg, PGPGOUT, 1);
715                 nr_pages = -nr_pages; /* for event */
716         }
717 
718         __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages);
719 }
720 
721 unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
722                                            int nid, unsigned int lru_mask)
723 {
724         struct lruvec *lruvec = mem_cgroup_lruvec(NODE_DATA(nid), memcg);
725         unsigned long nr = 0;
726         enum lru_list lru;
727 
728         VM_BUG_ON((unsigned)nid >= nr_node_ids);
729 
730         for_each_lru(lru) {
731                 if (!(BIT(lru) & lru_mask))
732                         continue;
733                 nr += mem_cgroup_get_lru_size(lruvec, lru);
734         }
735         return nr;
736 }
737 
738 static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
739                         unsigned int lru_mask)
740 {
741         unsigned long nr = 0;
742         int nid;
743 
744         for_each_node_state(nid, N_MEMORY)
745                 nr += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
746         return nr;
747 }
748 
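     /*
      * Returns true when enough page events have accumulated since @target
      * last fired, and arms the next threshold for it.
      */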
749 static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
750                                        enum mem_cgroup_events_target target)
751 {
752         unsigned long val, next;
753 
754         val = __this_cpu_read(memcg->stat_cpu->nr_page_events);
755         next = __this_cpu_read(memcg->stat_cpu->targets[target]);
756         /* from time_after() in jiffies.h */
757         if ((long)(next - val) < 0) {
758                 switch (target) {
759                 case MEM_CGROUP_TARGET_THRESH:
760                         next = val + THRESHOLDS_EVENTS_TARGET;
761                         break;
762                 case MEM_CGROUP_TARGET_SOFTLIMIT:
763                         next = val + SOFTLIMIT_EVENTS_TARGET;
764                         break;
765                 case MEM_CGROUP_TARGET_NUMAINFO:
766                         next = val + NUMAINFO_EVENTS_TARGET;
767                         break;
768                 default:
769                         break;
770                 }
771                 __this_cpu_write(memcg->stat_cpu->targets[target], next);
772                 return true;
773         }
774         return false;
775 }
776 
777 /*
778  * Check events in order.
779  *
780  */
781 static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
782 {
783         /* threshold event is triggered in finer grain than soft limit */
784         if (unlikely(mem_cgroup_event_ratelimit(memcg,
785                                                 MEM_CGROUP_TARGET_THRESH))) {
786                 bool do_softlimit;
787                 bool do_numainfo __maybe_unused;
788 
789                 do_softlimit = mem_cgroup_event_ratelimit(memcg,
790                                                 MEM_CGROUP_TARGET_SOFTLIMIT);
791 #if MAX_NUMNODES > 1
792                 do_numainfo = mem_cgroup_event_ratelimit(memcg,
793                                                 MEM_CGROUP_TARGET_NUMAINFO);
794 #endif
795                 mem_cgroup_threshold(memcg);
796                 if (unlikely(do_softlimit))
797                         mem_cgroup_update_tree(memcg, page);
798 #if MAX_NUMNODES > 1
799                 if (unlikely(do_numainfo))
800                         atomic_inc(&memcg->numainfo_events);
801 #endif
802         }
803 }
804 
805 struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
806 {
807         /*
808          * mm_update_next_owner() may clear mm->owner to NULL
809          * if it races with swapoff, page migration, etc.
810          * So this can be called with p == NULL.
811          */
812         if (unlikely(!p))
813                 return NULL;
814 
815         return mem_cgroup_from_css(task_css(p, memory_cgrp_id));
816 }
817 EXPORT_SYMBOL(mem_cgroup_from_task);
818 
819 /**
820  * get_mem_cgroup_from_mm: Obtain a reference on given mm_struct's memcg.
821  * @mm: mm from which memcg should be extracted. It can be NULL.
822  *
 823  * Obtain a reference on mm->memcg and return it if successful. Otherwise
824  * root_mem_cgroup is returned. However if mem_cgroup is disabled, NULL is
825  * returned.
826  */
827 struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
828 {
829         struct mem_cgroup *memcg;
830 
831         if (mem_cgroup_disabled())
832                 return NULL;
833 
834         rcu_read_lock();
835         do {
836                 /*
 837          * Page cache insertions can happen without an
838                  * actual mm context, e.g. during disk probing
839                  * on boot, loopback IO, acct() writes etc.
840                  */
841                 if (unlikely(!mm))
842                         memcg = root_mem_cgroup;
843                 else {
844                         memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
845                         if (unlikely(!memcg))
846                                 memcg = root_mem_cgroup;
847                 }
848         } while (!css_tryget_online(&memcg->css));
849         rcu_read_unlock();
850         return memcg;
851 }
852 EXPORT_SYMBOL(get_mem_cgroup_from_mm);
853 
854 /**
855  * get_mem_cgroup_from_page: Obtain a reference on given page's memcg.
856  * @page: page from which memcg should be extracted.
857  *
 858  * Obtain a reference on page->memcg and return it if successful. Otherwise
859  * root_mem_cgroup is returned.
860  */
861 struct mem_cgroup *get_mem_cgroup_from_page(struct page *page)
862 {
863         struct mem_cgroup *memcg = page->mem_cgroup;
864 
865         if (mem_cgroup_disabled())
866                 return NULL;
867 
868         rcu_read_lock();
869         if (!memcg || !css_tryget_online(&memcg->css))
870                 memcg = root_mem_cgroup;
871         rcu_read_unlock();
872         return memcg;
873 }
874 EXPORT_SYMBOL(get_mem_cgroup_from_page);
875 
876 /**
 877  * If current->active_memcg is non-NULL, do not fall back to current->mm->memcg.
878  */
879 static __always_inline struct mem_cgroup *get_mem_cgroup_from_current(void)
880 {
881         if (unlikely(current->active_memcg)) {
882                 struct mem_cgroup *memcg = root_mem_cgroup;
883 
884                 rcu_read_lock();
885                 if (css_tryget_online(&current->active_memcg->css))
886                         memcg = current->active_memcg;
887                 rcu_read_unlock();
888                 return memcg;
889         }
890         return get_mem_cgroup_from_mm(current->mm);
891 }
892 
893 /**
894  * mem_cgroup_iter - iterate over memory cgroup hierarchy
895  * @root: hierarchy root
896  * @prev: previously returned memcg, NULL on first invocation
897  * @reclaim: cookie for shared reclaim walks, NULL for full walks
898  *
899  * Returns references to children of the hierarchy below @root, or
900  * @root itself, or %NULL after a full round-trip.
901  *
902  * Caller must pass the return value in @prev on subsequent
903  * invocations for reference counting, or use mem_cgroup_iter_break()
904  * to cancel a hierarchy walk before the round-trip is complete.
905  *
906  * Reclaimers can specify a node and a priority level in @reclaim to
907  * divide up the memcgs in the hierarchy among all concurrent
908  * reclaimers operating on the same node and priority.
909  */
910 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root,
911                                    struct mem_cgroup *prev,
912                                    struct mem_cgroup_reclaim_cookie *reclaim)
913 {
914         struct mem_cgroup_reclaim_iter *uninitialized_var(iter);
915         struct cgroup_subsys_state *css = NULL;
916         struct mem_cgroup *memcg = NULL;
917         struct mem_cgroup *pos = NULL;
918 
919         if (mem_cgroup_disabled())
920                 return NULL;
921 
922         if (!root)
923                 root = root_mem_cgroup;
924 
925         if (prev && !reclaim)
926                 pos = prev;
927 
928         if (!root->use_hierarchy && root != root_mem_cgroup) {
929                 if (prev)
930                         goto out;
931                 return root;
932         }
933 
934         rcu_read_lock();
935 
936         if (reclaim) {
937                 struct mem_cgroup_per_node *mz;
938 
939                 mz = mem_cgroup_nodeinfo(root, reclaim->pgdat->node_id);
940                 iter = &mz->iter[reclaim->priority];
941 
942                 if (prev && reclaim->generation != iter->generation)
943                         goto out_unlock;
944 
945                 while (1) {
946                         pos = READ_ONCE(iter->position);
947                         if (!pos || css_tryget(&pos->css))
948                                 break;
949                         /*
950                          * css reference reached zero, so iter->position will
951                          * be cleared by ->css_released. However, we should not
952                          * rely on this happening soon, because ->css_released
953                          * is called from a work queue, and by busy-waiting we
954                          * might block it. So we clear iter->position right
955                          * away.
956                          */
957                         (void)cmpxchg(&iter->position, pos, NULL);
958                 }
959         }
960 
961         if (pos)
962                 css = &pos->css;
963 
964         for (;;) {
965                 css = css_next_descendant_pre(css, &root->css);
966                 if (!css) {
967                         /*
968                          * Reclaimers share the hierarchy walk, and a
969                          * new one might jump in right at the end of
970                          * the hierarchy - make sure they see at least
971                          * one group and restart from the beginning.
972                          */
973                         if (!prev)
974                                 continue;
975                         break;
976                 }
977 
978                 /*
979                  * Verify the css and acquire a reference.  The root
980                  * is provided by the caller, so we know it's alive
981                  * and kicking, and don't take an extra reference.
982                  */
983                 memcg = mem_cgroup_from_css(css);
984 
985                 if (css == &root->css)
986                         break;
987 
988                 if (css_tryget(css))
989                         break;
990 
991                 memcg = NULL;
992         }
993 
994         if (reclaim) {
995                 /*
996                  * The position could have already been updated by a competing
997                  * thread, so check that the value hasn't changed since we read
998                  * it to avoid reclaiming from the same cgroup twice.
999                  */
1000                 (void)cmpxchg(&iter->position, pos, memcg);
1001 
1002                 if (pos)
1003                         css_put(&pos->css);
1004 
1005                 if (!memcg)
1006                         iter->generation++;
1007                 else if (!prev)
1008                         reclaim->generation = iter->generation;
1009         }
1010 
1011 out_unlock:
1012         rcu_read_unlock();
1013 out:
1014         if (prev && prev != root)
1015                 css_put(&prev->css);
1016 
1017         return memcg;
1018 }
1019 
1020 /**
1021  * mem_cgroup_iter_break - abort a hierarchy walk prematurely
1022  * @root: hierarchy root
1023  * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
1024  */
1025 void mem_cgroup_iter_break(struct mem_cgroup *root,
1026                            struct mem_cgroup *prev)
1027 {
1028         if (!root)
1029                 root = root_mem_cgroup;
1030         if (prev && prev != root)
1031                 css_put(&prev->css);
1032 }
1033 
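      /*
       * Clear any reclaim iterator position that still points at @dead_memcg,
       * in the dead memcg itself and in all of its ancestors, so the stale
       * pointer is never handed out again.
       */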
1034 static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
1035 {
1036         struct mem_cgroup *memcg = dead_memcg;
1037         struct mem_cgroup_reclaim_iter *iter;
1038         struct mem_cgroup_per_node *mz;
1039         int nid;
1040         int i;
1041 
1042         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1043                 for_each_node(nid) {
1044                         mz = mem_cgroup_nodeinfo(memcg, nid);
1045                         for (i = 0; i <= DEF_PRIORITY; i++) {
1046                                 iter = &mz->iter[i];
1047                                 cmpxchg(&iter->position,
1048                                         dead_memcg, NULL);
1049                         }
1050                 }
1051         }
1052 }
1053 
1054 /**
1055  * mem_cgroup_scan_tasks - iterate over tasks of a memory cgroup hierarchy
1056  * @memcg: hierarchy root
1057  * @fn: function to call for each task
1058  * @arg: argument passed to @fn
1059  *
1060  * This function iterates over tasks attached to @memcg or to any of its
1061  * descendants and calls @fn for each task. If @fn returns a non-zero
1062  * value, the function breaks the iteration loop and returns the value.
1063  * Otherwise, it will iterate over all tasks and return 0.
1064  *
1065  * This function must not be called for the root memory cgroup.
1066  */
1067 int mem_cgroup_scan_tasks(struct mem_cgroup *memcg,
1068                           int (*fn)(struct task_struct *, void *), void *arg)
1069 {
1070         struct mem_cgroup *iter;
1071         int ret = 0;
1072 
1073         BUG_ON(memcg == root_mem_cgroup);
1074 
1075         for_each_mem_cgroup_tree(iter, memcg) {
1076                 struct css_task_iter it;
1077                 struct task_struct *task;
1078 
1079                 css_task_iter_start(&iter->css, 0, &it);
1080                 while (!ret && (task = css_task_iter_next(&it)))
1081                         ret = fn(task, arg);
1082                 css_task_iter_end(&it);
1083                 if (ret) {
1084                         mem_cgroup_iter_break(memcg, iter);
1085                         break;
1086                 }
1087         }
1088         return ret;
1089 }
1090 
1091 /**
1092  * mem_cgroup_page_lruvec - return lruvec for isolating/putting an LRU page
1093  * @page: the page
1094  * @pgdat: pgdat of the page
1095  *
1096  * This function is only safe when following the LRU page isolation
1097  * and putback protocol: the LRU lock must be held, and the page must
1098  * either be PageLRU() or the caller must have isolated/allocated it.
1099  */
1100 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct pglist_data *pgdat)
1101 {
1102         struct mem_cgroup_per_node *mz;
1103         struct mem_cgroup *memcg;
1104         struct lruvec *lruvec;
1105 
1106         if (mem_cgroup_disabled()) {
1107                 lruvec = &pgdat->lruvec;
1108                 goto out;
1109         }
1110 
1111         memcg = page->mem_cgroup;
1112         /*
1113          * Swapcache readahead pages are added to the LRU - and
1114          * possibly migrated - before they are charged.
1115          */
1116         if (!memcg)
1117                 memcg = root_mem_cgroup;
1118 
1119         mz = mem_cgroup_page_nodeinfo(memcg, page);
1120         lruvec = &mz->lruvec;
1121 out:
1122         /*
1123          * Since a node can be onlined after the mem_cgroup was created,
1124          * we have to be prepared to initialize lruvec->pgdat here;
1125          * and if offlined then reonlined, we need to reinitialize it.
1126          */
1127         if (unlikely(lruvec->pgdat != pgdat))
1128                 lruvec->pgdat = pgdat;
1129         return lruvec;
1130 }
1131 
1132 /**
1133  * mem_cgroup_update_lru_size - account for adding or removing an lru page
1134  * @lruvec: mem_cgroup per zone lru vector
1135  * @lru: index of lru list the page is sitting on
1136  * @zid: zone id of the accounted pages
1137  * @nr_pages: positive when adding or negative when removing
1138  *
1139  * This function must be called under lru_lock, just before a page is added
1140  * to or just after a page is removed from an lru list (that ordering being
1141  * so as to allow it to check that lru_size 0 is consistent with list_empty).
1142  */
1143 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru,
1144                                 int zid, int nr_pages)
1145 {
1146         struct mem_cgroup_per_node *mz;
1147         unsigned long *lru_size;
1148         long size;
1149 
1150         if (mem_cgroup_disabled())
1151                 return;
1152 
1153         mz = container_of(lruvec, struct mem_cgroup_per_node, lruvec);
1154         lru_size = &mz->lru_zone_size[zid][lru];
1155 
1156         if (nr_pages < 0)
1157                 *lru_size += nr_pages;
1158 
1159         size = *lru_size;
1160         if (WARN_ONCE(size < 0,
1161                 "%s(%p, %d, %d): lru_size %ld\n",
1162                 __func__, lruvec, lru, nr_pages, size)) {
1163                 VM_BUG_ON(1);
1164                 *lru_size = 0;
1165         }
1166 
1167         if (nr_pages > 0)
1168                 *lru_size += nr_pages;
1169 }
1170 
1171 bool task_in_mem_cgroup(struct task_struct *task, struct mem_cgroup *memcg)
1172 {
1173         struct mem_cgroup *task_memcg;
1174         struct task_struct *p;
1175         bool ret;
1176 
1177         p = find_lock_task_mm(task);
1178         if (p) {
1179                 task_memcg = get_mem_cgroup_from_mm(p->mm);
1180                 task_unlock(p);
1181         } else {
1182                 /*
1183                  * All threads may have already detached their mm's, but the oom
1184                  * killer still needs to detect if they have already been oom
1185                  * killed to prevent needlessly killing additional tasks.
1186                  */
1187                 rcu_read_lock();
1188                 task_memcg = mem_cgroup_from_task(task);
1189                 css_get(&task_memcg->css);
1190                 rcu_read_unlock();
1191         }
1192         ret = mem_cgroup_is_descendant(task_memcg, memcg);
1193         css_put(&task_memcg->css);
1194         return ret;
1195 }
1196 
1197 /**
1198  * mem_cgroup_margin - calculate chargeable space of a memory cgroup
1199  * @memcg: the memory cgroup
1200  *
1201  * Returns the maximum amount of memory @memcg can be charged with, in
1202  * pages.
1203  */
1204 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg)
1205 {
1206         unsigned long margin = 0;
1207         unsigned long count;
1208         unsigned long limit;
1209 
1210         count = page_counter_read(&memcg->memory);
1211         limit = READ_ONCE(memcg->memory.max);
1212         if (count < limit)
1213                 margin = limit - count;
1214 
1215         if (do_memsw_account()) {
1216                 count = page_counter_read(&memcg->memsw);
1217                 limit = READ_ONCE(memcg->memsw.max);
1218                 if (count <= limit)
1219                         margin = min(margin, limit - count);
1220                 else
1221                         margin = 0;
1222         }
1223 
1224         return margin;
1225 }
1226 
1227 /*
1228  * A routine for checking whether "mem" is under move_account() or not.
1229  *
1230  * Checks whether a cgroup is mc.from, mc.to, or in the hierarchy of a
1231  * moving cgroup. This is for waiting at the high memory pressure
1232  * caused by "move".
1233  */
1234 static bool mem_cgroup_under_move(struct mem_cgroup *memcg)
1235 {
1236         struct mem_cgroup *from;
1237         struct mem_cgroup *to;
1238         bool ret = false;
1239         /*
1240          * Unlike the task_move routines, we access mc.to and mc.from without
1241          * mutual exclusion by cgroup_mutex; here we take the spinlock instead.
1242          */
1243         spin_lock(&mc.lock);
1244         from = mc.from;
1245         to = mc.to;
1246         if (!from)
1247                 goto unlock;
1248 
1249         ret = mem_cgroup_is_descendant(from, memcg) ||
1250                 mem_cgroup_is_descendant(to, memcg);
1251 unlock:
1252         spin_unlock(&mc.lock);
1253         return ret;
1254 }
1255 
1256 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg)
1257 {
1258         if (mc.moving_task && current != mc.moving_task) {
1259                 if (mem_cgroup_under_move(memcg)) {
1260                         DEFINE_WAIT(wait);
1261                         prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE);
1262                         /* moving charge context might have finished. */
1263                         if (mc.moving_task)
1264                                 schedule();
1265                         finish_wait(&mc.waitq, &wait);
1266                         return true;
1267                 }
1268         }
1269         return false;
1270 }
1271 
1272 static const unsigned int memcg1_stats[] = {
1273         MEMCG_CACHE,
1274         MEMCG_RSS,
1275         MEMCG_RSS_HUGE,
1276         NR_SHMEM,
1277         NR_FILE_MAPPED,
1278         NR_FILE_DIRTY,
1279         NR_WRITEBACK,
1280         MEMCG_SWAP,
1281 };
1282 
1283 static const char *const memcg1_stat_names[] = {
1284         "cache",
1285         "rss",
1286         "rss_huge",
1287         "shmem",
1288         "mapped_file",
1289         "dirty",
1290         "writeback",
1291         "swap",
1292 };
1293 
1294 #define K(x) ((x) << (PAGE_SHIFT-10))
1295 /**
1296  * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller.
1297  * @memcg: The memory cgroup that went over limit
1298  * @p: Task that is going to be killed
1299  *
1300  * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is
1301  * enabled
1302  */
1303 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p)
1304 {
1305         struct mem_cgroup *iter;
1306         unsigned int i;
1307 
1308         rcu_read_lock();
1309 
1310         if (p) {
1311                 pr_info("Task in ");
1312                 pr_cont_cgroup_path(task_cgroup(p, memory_cgrp_id));
1313                 pr_cont(" killed as a result of limit of ");
1314         } else {
1315                 pr_info("Memory limit reached of cgroup ");
1316         }
1317 
1318         pr_cont_cgroup_path(memcg->css.cgroup);
1319         pr_cont("\n");
1320 
1321         rcu_read_unlock();
1322 
1323         pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n",
1324                 K((u64)page_counter_read(&memcg->memory)),
1325                 K((u64)memcg->memory.max), memcg->memory.failcnt);
1326         pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n",
1327                 K((u64)page_counter_read(&memcg->memsw)),
1328                 K((u64)memcg->memsw.max), memcg->memsw.failcnt);
1329         pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n",
1330                 K((u64)page_counter_read(&memcg->kmem)),
1331                 K((u64)memcg->kmem.max), memcg->kmem.failcnt);
1332 
1333         for_each_mem_cgroup_tree(iter, memcg) {
1334                 pr_info("Memory cgroup stats for ");
1335                 pr_cont_cgroup_path(iter->css.cgroup);
1336                 pr_cont(":");
1337 
1338                 for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
1339                         if (memcg1_stats[i] == MEMCG_SWAP && !do_swap_account)
1340                                 continue;
1341                         pr_cont(" %s:%luKB", memcg1_stat_names[i],
1342                                 K(memcg_page_state(iter, memcg1_stats[i])));
1343                 }
1344 
1345                 for (i = 0; i < NR_LRU_LISTS; i++)
1346                         pr_cont(" %s:%luKB", mem_cgroup_lru_names[i],
1347                                 K(mem_cgroup_nr_lru_pages(iter, BIT(i))));
1348 
1349                 pr_cont("\n");
1350         }
1351 }
1352 
1353 /*
1354  * Return the memory (and swap, if configured) limit for a memcg.
1355  */
1356 unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg)
1357 {
1358         unsigned long max;
1359 
1360         max = memcg->memory.max;
1361         if (mem_cgroup_swappiness(memcg)) {
1362                 unsigned long memsw_max;
1363                 unsigned long swap_max;
1364 
1365                 memsw_max = memcg->memsw.max;
1366                 swap_max = memcg->swap.max;
1367                 swap_max = min(swap_max, (unsigned long)total_swap_pages);
1368                 max = min(max + swap_max, memsw_max);
1369         }
1370         return max;
1371 }
1372 
1373 static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask,
1374                                      int order)
1375 {
1376         struct oom_control oc = {
1377                 .zonelist = NULL,
1378                 .nodemask = NULL,
1379                 .memcg = memcg,
1380                 .gfp_mask = gfp_mask,
1381                 .order = order,
1382         };
1383         bool ret;
1384 
1385         mutex_lock(&oom_lock);
1386         ret = out_of_memory(&oc);
1387         mutex_unlock(&oom_lock);
1388         return ret;
1389 }
1390 
1391 #if MAX_NUMNODES > 1
1392 
1393 /**
1394  * test_mem_cgroup_node_reclaimable
1395  * @memcg: the target memcg
1396  * @nid: the node ID to be checked.
1397  * @noswap : specify true here if the user wants flle only information.
1398  *
1399  * This function returns whether the specified memcg contains any
1400  * reclaimable pages on a node. Returns true if there are any reclaimable
1401  * pages in the node.
1402  */
1403 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1404                 int nid, bool noswap)
1405 {
1406         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1407                 return true;
1408         if (noswap || !total_swap_pages)
1409                 return false;
1410         if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1411                 return true;
1412         return false;
1413 
1414 }
1415 
1416 /*
1417  * Always updating the nodemask is not very good - even if we have an empty
1418  * list or the wrong list here, we can start from some node and traverse all
1419  * nodes based on the zonelist. So update the list loosely once per 10 seconds.
1420  *
1421  */
1422 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1423 {
1424         int nid;
1425         /*
1426          * numainfo_events > 0 means there were at least NUMAINFO_EVENTS_TARGET
1427          * pagein/pageout changes since the last update.
1428          */
1429         if (!atomic_read(&memcg->numainfo_events))
1430                 return;
1431         if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1432                 return;
1433 
1434         /* make a nodemask where this memcg uses memory from */
1435         memcg->scan_nodes = node_states[N_MEMORY];
1436 
1437         for_each_node_mask(nid, node_states[N_MEMORY]) {
1438 
1439                 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1440                         node_clear(nid, memcg->scan_nodes);
1441         }
1442 
1443         atomic_set(&memcg->numainfo_events, 0);
1444         atomic_set(&memcg->numainfo_updating, 0);
1445 }
1446 
1447 /*
1448  * Select a node to start reclaim from. Because what we need is just to
1449  * reduce the usage counter, starting from anywhere is OK. Considering
1450  * memory reclaim from the current node, there are pros and cons:
1451  *
1452  * Freeing memory from the current node means freeing memory from a node which
1453  * we'll use or have used, so it may make the LRU bad. And if several threads
1454  * hit their limits, they will all contend on the same node. But freeing from a
1455  * remote node means a higher memory reclaim cost because of memory latency.
1456  *
1457  * For now we use round-robin. A better algorithm is welcome.
1458  */
1459 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1460 {
1461         int node;
1462 
1463         mem_cgroup_may_update_nodemask(memcg);
1464         node = memcg->last_scanned_node;
1465 
1466         node = next_node_in(node, memcg->scan_nodes);
1467         /*
1468          * mem_cgroup_may_update_nodemask might have seen no reclaimable pages
1469          * last time it really checked all the LRUs due to rate limiting.
1470          * Fall back to the current node in that case for simplicity.
1471          */
1472         if (unlikely(node == MAX_NUMNODES))
1473                 node = numa_node_id();
1474 
1475         memcg->last_scanned_node = node;
1476         return node;
1477 }
1478 #else
1479 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1480 {
1481         return 0;
1482 }
1483 #endif
1484 
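      /*
       * Reclaim pages from the hierarchy below @root_memcg on @pgdat until
       * its soft-limit excess is gone, giving up after a bounded number of
       * unproductive rounds.  Returns the number of reclaimed pages.
       */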
1485 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg,
1486                                    pg_data_t *pgdat,
1487                                    gfp_t gfp_mask,
1488                                    unsigned long *total_scanned)
1489 {
1490         struct mem_cgroup *victim = NULL;
1491         int total = 0;
1492         int loop = 0;
1493         unsigned long excess;
1494         unsigned long nr_scanned;
1495         struct mem_cgroup_reclaim_cookie reclaim = {
1496                 .pgdat = pgdat,
1497                 .priority = 0,
1498         };
1499 
1500         excess = soft_limit_excess(root_memcg);
1501 
1502         while (1) {
1503                 victim = mem_cgroup_iter(root_memcg, victim, &reclaim);
1504                 if (!victim) {
1505                         loop++;
1506                         if (loop >= 2) {
1507                                 /*
1508                                  * If we have not been able to reclaim
1509                                  * anything, it might be because there are
1510                                  * no reclaimable pages under this hierarchy.
1511                                  */
1512                                 if (!total)
1513                                         break;
1514                                 /*
1515                                  * We want to do more targeted reclaim.
1516                                  * excess >> 2 is not too large, so we don't
1517                                  * reclaim too much, nor too small, so we don't
1518                                  * keep coming back to reclaim from this cgroup
1519                                  */
1520                                 if (total >= (excess >> 2) ||
1521                                         (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS))
1522                                         break;
1523                         }
1524                         continue;
1525                 }
1526                 total += mem_cgroup_shrink_node(victim, gfp_mask, false,
1527                                         pgdat, &nr_scanned);
1528                 *total_scanned += nr_scanned;
1529                 if (!soft_limit_excess(root_memcg))
1530                         break;
1531         }
1532         mem_cgroup_iter_break(root_memcg, victim);
1533         return total;
1534 }
1535 
1536 #ifdef CONFIG_LOCKDEP
1537 static struct lockdep_map memcg_oom_lock_dep_map = {
1538         .name = "memcg_oom_lock",
1539 };
1540 #endif
1541 
1542 static DEFINE_SPINLOCK(memcg_oom_lock);
1543 
1544 /*
1545  * Check whether the OOM killer is already running under our hierarchy.
1546  * If someone else is running it, return false.
1547  */
1548 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg)
1549 {
1550         struct mem_cgroup *iter, *failed = NULL;
1551 
1552         spin_lock(&memcg_oom_lock);
1553 
1554         for_each_mem_cgroup_tree(iter, memcg) {
1555                 if (iter->oom_lock) {
1556                         /*
1557                          * this subtree of our hierarchy is already locked
1558                          * so we cannot take the lock.
1559                          */
1560                         failed = iter;
1561                         mem_cgroup_iter_break(memcg, iter);
1562                         break;
1563                 } else
1564                         iter->oom_lock = true;
1565         }
1566 
1567         if (failed) {
1568                 /*
1569                  * OK, we failed to lock the whole subtree, so we have
1570                  * to clean up what we set up, up to the failing subtree
1571                  */
1572                 for_each_mem_cgroup_tree(iter, memcg) {
1573                         if (iter == failed) {
1574                                 mem_cgroup_iter_break(memcg, iter);
1575                                 break;
1576                         }
1577                         iter->oom_lock = false;
1578                 }
1579         } else
1580                 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_);
1581 
1582         spin_unlock(&memcg_oom_lock);
1583 
1584         return !failed;
1585 }
1586 
1587 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg)
1588 {
1589         struct mem_cgroup *iter;
1590 
1591         spin_lock(&memcg_oom_lock);
1592         mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_);
1593         for_each_mem_cgroup_tree(iter, memcg)
1594                 iter->oom_lock = false;
1595         spin_unlock(&memcg_oom_lock);
1596 }
1597 
1598 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg)
1599 {
1600         struct mem_cgroup *iter;
1601 
1602         spin_lock(&memcg_oom_lock);
1603         for_each_mem_cgroup_tree(iter, memcg)
1604                 iter->under_oom++;
1605         spin_unlock(&memcg_oom_lock);
1606 }
1607 
1608 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg)
1609 {
1610         struct mem_cgroup *iter;
1611 
1612         /*
1613          * When a new child is created while the hierarchy is under oom,
1614          * mem_cgroup_oom_lock() may not be called. Watch for underflow.
1615          */
1616         spin_lock(&memcg_oom_lock);
1617         for_each_mem_cgroup_tree(iter, memcg)
1618                 if (iter->under_oom > 0)
1619                         iter->under_oom--;
1620         spin_unlock(&memcg_oom_lock);
1621 }
1622 
1623 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq);
1624 
1625 struct oom_wait_info {
1626         struct mem_cgroup *memcg;
1627         wait_queue_entry_t      wait;
1628 };
1629 
1630 static int memcg_oom_wake_function(wait_queue_entry_t *wait,
1631         unsigned mode, int sync, void *arg)
1632 {
1633         struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg;
1634         struct mem_cgroup *oom_wait_memcg;
1635         struct oom_wait_info *oom_wait_info;
1636 
1637         oom_wait_info = container_of(wait, struct oom_wait_info, wait);
1638         oom_wait_memcg = oom_wait_info->memcg;
1639 
1640         if (!mem_cgroup_is_descendant(wake_memcg, oom_wait_memcg) &&
1641             !mem_cgroup_is_descendant(oom_wait_memcg, wake_memcg))
1642                 return 0;
1643         return autoremove_wake_function(wait, mode, sync, arg);
1644 }
1645 
1646 static void memcg_oom_recover(struct mem_cgroup *memcg)
1647 {
1648         /*
1649          * For the following lockless ->under_oom test, the only required
1650          * guarantee is that it must see the state asserted by an OOM when
1651          * this function is called as a result of userland actions
1652          * triggered by the notification of the OOM.  This is trivially
1653          * achieved by invoking mem_cgroup_mark_under_oom() before
1654          * triggering notification.
1655          */
1656         if (memcg && memcg->under_oom)
1657                 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg);
1658 }
1659 
1660 enum oom_status {
1661         OOM_SUCCESS,
1662         OOM_FAILED,
1663         OOM_ASYNC,
1664         OOM_SKIPPED
1665 };
1666 
1667 static enum oom_status mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order)
1668 {
1669         enum oom_status ret;
1670         bool locked;
1671 
1672         if (order > PAGE_ALLOC_COSTLY_ORDER)
1673                 return OOM_SKIPPED;
1674 
1675         memcg_memory_event(memcg, MEMCG_OOM);
1676 
1677         /*
1678          * We are in the middle of the charge context here, so we
1679          * don't want to block when potentially sitting on a callstack
1680          * that holds all kinds of filesystem and mm locks.
1681          *
1682          * cgroup1 allows disabling the OOM killer and waiting for outside
1683          * handling until the charge can succeed; remember the context and put
1684          * the task to sleep at the end of the page fault when all locks are
1685          * released.
1686          *
1687          * On the other hand, in-kernel OOM killer allows for an async victim
1688          * memory reclaim (oom_reaper) and that means that we are not solely
1689          * relying on the oom victim to make a forward progress and we can
1690          * invoke the oom killer here.
1691          *
1692          * Please note that mem_cgroup_out_of_memory might fail to find a
1693          * victim and then we have to bail out from the charge path.
1694          */
1695         if (memcg->oom_kill_disable) {
1696                 if (!current->in_user_fault)
1697                         return OOM_SKIPPED;
1698                 css_get(&memcg->css);
1699                 current->memcg_in_oom = memcg;
1700                 current->memcg_oom_gfp_mask = mask;
1701                 current->memcg_oom_order = order;
1702 
1703                 return OOM_ASYNC;
1704         }
1705 
1706         mem_cgroup_mark_under_oom(memcg);
1707 
1708         locked = mem_cgroup_oom_trylock(memcg);
1709 
1710         if (locked)
1711                 mem_cgroup_oom_notify(memcg);
1712 
1713         mem_cgroup_unmark_under_oom(memcg);
1714         if (mem_cgroup_out_of_memory(memcg, mask, order))
1715                 ret = OOM_SUCCESS;
1716         else
1717                 ret = OOM_FAILED;
1718 
1719         if (locked)
1720                 mem_cgroup_oom_unlock(memcg);
1721 
1722         return ret;
1723 }
1724 
1725 /**
1726  * mem_cgroup_oom_synchronize - complete memcg OOM handling
1727  * @handle: actually kill/wait or just clean up the OOM state
1728  *
1729  * This has to be called at the end of a page fault if the memcg OOM
1730  * handler was enabled.
1731  *
1732  * Memcg supports userspace OOM handling where failed allocations must
1733  * sleep on a waitqueue until the userspace task resolves the
1734  * situation.  Sleeping directly in the charge context with all kinds
1735  * of locks held is not a good idea, instead we remember an OOM state
1736  * in the task and mem_cgroup_oom_synchronize() has to be called at
1737  * the end of the page fault to complete the OOM handling.
1738  *
1739  * Returns %true if an ongoing memcg OOM situation was detected and
1740  * completed, %false otherwise.
1741  */
1742 bool mem_cgroup_oom_synchronize(bool handle)
1743 {
1744         struct mem_cgroup *memcg = current->memcg_in_oom;
1745         struct oom_wait_info owait;
1746         bool locked;
1747 
1748         /* OOM is global, do not handle */
1749         if (!memcg)
1750                 return false;
1751 
1752         if (!handle)
1753                 goto cleanup;
1754 
1755         owait.memcg = memcg;
1756         owait.wait.flags = 0;
1757         owait.wait.func = memcg_oom_wake_function;
1758         owait.wait.private = current;
1759         INIT_LIST_HEAD(&owait.wait.entry);
1760 
1761         prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE);
1762         mem_cgroup_mark_under_oom(memcg);
1763 
1764         locked = mem_cgroup_oom_trylock(memcg);
1765 
1766         if (locked)
1767                 mem_cgroup_oom_notify(memcg);
1768 
1769         if (locked && !memcg->oom_kill_disable) {
1770                 mem_cgroup_unmark_under_oom(memcg);
1771                 finish_wait(&memcg_oom_waitq, &owait.wait);
1772                 mem_cgroup_out_of_memory(memcg, current->memcg_oom_gfp_mask,
1773                                          current->memcg_oom_order);
1774         } else {
1775                 schedule();
1776                 mem_cgroup_unmark_under_oom(memcg);
1777                 finish_wait(&memcg_oom_waitq, &owait.wait);
1778         }
1779 
1780         if (locked) {
1781                 mem_cgroup_oom_unlock(memcg);
1782                 /*
1783                  * There is no guarantee that an OOM-lock contender
1784                  * sees the wakeups triggered by the OOM kill
1785                  * uncharges.  Wake any sleepers explicitly.
1786                  */
1787                 memcg_oom_recover(memcg);
1788         }
1789 cleanup:
1790         current->memcg_in_oom = NULL;
1791         css_put(&memcg->css);
1792         return true;
1793 }
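/*
 * Illustrative sketch, not part of the original file: how the end of a
 * user page fault is expected to complete memcg OOM handling (in
 * mainline this happens from the page-fault OOM path, e.g.
 * pagefault_out_of_memory()); the wrapper name is an assumption.
 */
static __maybe_unused void memcg_fault_exit_sketch(void)
{
	/* Kill/wait if a memcg OOM context was recorded during the fault. */
	if (mem_cgroup_oom_synchronize(true))
		return;

	/* Otherwise the global OOM path would take over here. */
}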
1794 
1795 /**
1796  * mem_cgroup_get_oom_group - get a memory cgroup to clean up after OOM
1797  * @victim: task to be killed by the OOM killer
1798  * @oom_domain: memcg in case of memcg OOM, NULL in case of system-wide OOM
1799  *
1800  * Returns a pointer to a memory cgroup, which has to be cleaned up
1801  * by killing all belonging OOM-killable tasks.
1802  *
1803  * Caller has to call mem_cgroup_put() on the returned non-NULL memcg.
1804  */
1805 struct mem_cgroup *mem_cgroup_get_oom_group(struct task_struct *victim,
1806                                             struct mem_cgroup *oom_domain)
1807 {
1808         struct mem_cgroup *oom_group = NULL;
1809         struct mem_cgroup *memcg;
1810 
1811         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
1812                 return NULL;
1813 
1814         if (!oom_domain)
1815                 oom_domain = root_mem_cgroup;
1816 
1817         rcu_read_lock();
1818 
1819         memcg = mem_cgroup_from_task(victim);
1820         if (memcg == root_mem_cgroup)
1821                 goto out;
1822 
1823         /*
1824          * Traverse the memory cgroup hierarchy from the victim task's
1825          * cgroup up to the OOMing cgroup (or root) to find the
1826          * highest-level memory cgroup with oom.group set.
1827          */
1828         for (; memcg; memcg = parent_mem_cgroup(memcg)) {
1829                 if (memcg->oom_group)
1830                         oom_group = memcg;
1831 
1832                 if (memcg == oom_domain)
1833                         break;
1834         }
1835 
1836         if (oom_group)
1837                 css_get(&oom_group->css);
1838 out:
1839         rcu_read_unlock();
1840 
1841         return oom_group;
1842 }
1843 
1844 void mem_cgroup_print_oom_group(struct mem_cgroup *memcg)
1845 {
1846         pr_info("Tasks in ");
1847         pr_cont_cgroup_path(memcg->css.cgroup);
1848         pr_cont(" are going to be killed due to memory.oom.group set\n");
1849 }
1850 
1851 /**
1852  * lock_page_memcg - lock a page->mem_cgroup binding
1853  * @page: the page
1854  *
1855  * This function protects unlocked LRU pages from being moved to
1856  * another cgroup.
1857  *
1858  * It ensures the lifetime of the returned memcg. The caller is responsible
1859  * for the lifetime of the page; __unlock_page_memcg() is available
1860  * when @page might get freed inside the locked section.
1861  */
1862 struct mem_cgroup *lock_page_memcg(struct page *page)
1863 {
1864         struct mem_cgroup *memcg;
1865         unsigned long flags;
1866 
1867         /*
1868          * The RCU lock is held throughout the transaction.  The fast
1869          * path can get away without acquiring the memcg->move_lock
1870          * because page moving starts with an RCU grace period.
1871          *
1872          * The RCU lock also protects the memcg from being freed when
1873          * the page state that is going to change is the only thing
1874          * preventing the page itself from being freed. E.g. writeback
1875          * doesn't hold a page reference and relies on PG_writeback to
1876          * keep off truncation, migration and so forth.
1877          */
1878         rcu_read_lock();
1879 
1880         if (mem_cgroup_disabled())
1881                 return NULL;
1882 again:
1883         memcg = page->mem_cgroup;
1884         if (unlikely(!memcg))
1885                 return NULL;
1886 
1887         if (atomic_read(&memcg->moving_account) <= 0)
1888                 return memcg;
1889 
1890         spin_lock_irqsave(&memcg->move_lock, flags);
1891         if (memcg != page->mem_cgroup) {
1892                 spin_unlock_irqrestore(&memcg->move_lock, flags);
1893                 goto again;
1894         }
1895 
1896         /*
1897          * When charge migration first begins, we can have locked and
1898          * unlocked page stat updates happening concurrently.  Track
1899          * the task who has the lock for unlock_page_memcg().
1900          */
1901         memcg->move_lock_task = current;
1902         memcg->move_lock_flags = flags;
1903 
1904         return memcg;
1905 }
1906 EXPORT_SYMBOL(lock_page_memcg);
1907 
1908 /**
1909  * __unlock_page_memcg - unlock and unpin a memcg
1910  * @memcg: the memcg
1911  *
1912  * Unlock and unpin a memcg returned by lock_page_memcg().
1913  */
1914 void __unlock_page_memcg(struct mem_cgroup *memcg)
1915 {
1916         if (memcg && memcg->move_lock_task == current) {
1917                 unsigned long flags = memcg->move_lock_flags;
1918 
1919                 memcg->move_lock_task = NULL;
1920                 memcg->move_lock_flags = 0;
1921 
1922                 spin_unlock_irqrestore(&memcg->move_lock, flags);
1923         }
1924 
1925         rcu_read_unlock();
1926 }
1927 
1928 /**
1929  * unlock_page_memcg - unlock a page->mem_cgroup binding
1930  * @page: the page
1931  */
1932 void unlock_page_memcg(struct page *page)
1933 {
1934         __unlock_page_memcg(page->mem_cgroup);
1935 }
1936 EXPORT_SYMBOL(unlock_page_memcg);
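/*
 * Illustrative sketch, not part of the original file: the typical
 * calling pattern for updating memcg-accounted page state without
 * racing against cgroup moves. The actual stat update is elided; only
 * the lock/unlock protocol comes from the functions above.
 */
static __maybe_unused void page_memcg_stat_update_sketch(struct page *page)
{
	struct mem_cgroup *memcg;

	memcg = lock_page_memcg(page);
	/* ... modify page state tied to page->mem_cgroup here ... */
	__unlock_page_memcg(memcg);	/* this variant tolerates @page being freed */
}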
1937 
1938 struct memcg_stock_pcp {
1939         struct mem_cgroup *cached; /* this is never the root cgroup */
1940         unsigned int nr_pages;
1941         struct work_struct work;
1942         unsigned long flags;
1943 #define FLUSHING_CACHED_CHARGE  0
1944 };
1945 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
1946 static DEFINE_MUTEX(percpu_charge_mutex);
1947 
1948 /**
1949  * consume_stock: Try to consume stocked charge on this cpu.
1950  * @memcg: memcg to consume from.
1951  * @nr_pages: how many pages to charge.
1952  *
1953  * The charges will only happen if @memcg matches the current cpu's memcg
1954  * stock, and at least @nr_pages are available in that stock.  If this
1955  * fails, the caller charges a full batch and refills the stock.
1956  *
1957  * returns true if successful, false otherwise.
1958  */
1959 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
1960 {
1961         struct memcg_stock_pcp *stock;
1962         unsigned long flags;
1963         bool ret = false;
1964 
1965         if (nr_pages > MEMCG_CHARGE_BATCH)
1966                 return ret;
1967 
1968         local_irq_save(flags);
1969 
1970         stock = this_cpu_ptr(&memcg_stock);
1971         if (memcg == stock->cached && stock->nr_pages >= nr_pages) {
1972                 stock->nr_pages -= nr_pages;
1973                 ret = true;
1974         }
1975 
1976         local_irq_restore(flags);
1977 
1978         return ret;
1979 }
1980 
1981 /*
1982  * Return the stocked charges to the page counters and reset cached information.
1983  */
1984 static void drain_stock(struct memcg_stock_pcp *stock)
1985 {
1986         struct mem_cgroup *old = stock->cached;
1987 
1988         if (stock->nr_pages) {
1989                 page_counter_uncharge(&old->memory, stock->nr_pages);
1990                 if (do_memsw_account())
1991                         page_counter_uncharge(&old->memsw, stock->nr_pages);
1992                 css_put_many(&old->css, stock->nr_pages);
1993                 stock->nr_pages = 0;
1994         }
1995         stock->cached = NULL;
1996 }
1997 
1998 static void drain_local_stock(struct work_struct *dummy)
1999 {
2000         struct memcg_stock_pcp *stock;
2001         unsigned long flags;
2002 
2003         /*
2004          * The only protection from memory hotplug vs. drain_stock races is
2005          * that we always operate on local CPU stock here with IRQ disabled
2006          */
2007         local_irq_save(flags);
2008 
2009         stock = this_cpu_ptr(&memcg_stock);
2010         drain_stock(stock);
2011         clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
2012 
2013         local_irq_restore(flags);
2014 }
2015 
2016 /*
2017  * Cache @nr_pages worth of charges in the local per-cpu area.
2018  * They will be consumed by consume_stock() later.
2019  */
2020 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages)
2021 {
2022         struct memcg_stock_pcp *stock;
2023         unsigned long flags;
2024 
2025         local_irq_save(flags);
2026 
2027         stock = this_cpu_ptr(&memcg_stock);
2028         if (stock->cached != memcg) { /* reset if necessary */
2029                 drain_stock(stock);
2030                 stock->cached = memcg;
2031         }
2032         stock->nr_pages += nr_pages;
2033 
2034         if (stock->nr_pages > MEMCG_CHARGE_BATCH)
2035                 drain_stock(stock);
2036 
2037         local_irq_restore(flags);
2038 }
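/*
 * Illustrative sketch, not part of the original file: how the per-cpu
 * stock is meant to be used by a charge path - consume from the local
 * stock first and fall back to the page counters otherwise.
 * try_charge() below is the real user; the helper name is an
 * assumption.
 */
static __maybe_unused bool memcg_stock_fast_path_sketch(struct mem_cgroup *memcg,
							unsigned int nr_pages)
{
	/* Fast path: pages already charged and cached on this CPU. */
	if (consume_stock(memcg, nr_pages))
		return true;

	/*
	 * The slow path would charge a full MEMCG_CHARGE_BATCH against
	 * the page counters and stash the surplus for later via
	 * refill_stock(memcg, batch - nr_pages).
	 */
	return false;
}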
2039 
2040 /*
2041  * Drain all per-CPU charge caches for the given root_memcg and the
2042  * subtree of the hierarchy under it.
2043  */
2044 static void drain_all_stock(struct mem_cgroup *root_memcg)
2045 {
2046         int cpu, curcpu;
2047 
2048         /* If someone's already draining, avoid running more workers. */
2049         if (!mutex_trylock(&percpu_charge_mutex))
2050                 return;
2051         /*
2052          * Notify other cpus that a system-wide "drain" is running.
2053          * We do not care about races with cpu hotplug because cpu-down,
2054          * as well as the workers from this path, always operates on the
2055          * local per-cpu data. CPU-up doesn't touch memcg_stock at all.
2056          */
2057         curcpu = get_cpu();
2058         for_each_online_cpu(cpu) {
2059                 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu);
2060                 struct mem_cgroup *memcg;
2061 
2062                 memcg = stock->cached;
2063                 if (!memcg || !stock->nr_pages || !css_tryget(&memcg->css))
2064                         continue;
2065                 if (!mem_cgroup_is_descendant(memcg, root_memcg)) {
2066                         css_put(&memcg->css);
2067                         continue;
2068                 }
2069                 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) {
2070                         if (cpu == curcpu)
2071                                 drain_local_stock(&stock->work);
2072                         else
2073                                 schedule_work_on(cpu, &stock->work);
2074                 }
2075                 css_put(&memcg->css);
2076         }
2077         put_cpu();
2078         mutex_unlock(&percpu_charge_mutex);
2079 }
2080 
2081 static int memcg_hotplug_cpu_dead(unsigned int cpu)
2082 {
2083         struct memcg_stock_pcp *stock;
2084         struct mem_cgroup *memcg;
2085 
2086         stock = &per_cpu(memcg_stock, cpu);
2087         drain_stock(stock);
2088 
2089         for_each_mem_cgroup(memcg) {
2090                 int i;
2091 
2092                 for (i = 0; i < MEMCG_NR_STAT; i++) {
2093                         int nid;
2094                         long x;
2095 
2096                         x = this_cpu_xchg(memcg->stat_cpu->count[i], 0);
2097                         if (x)
2098                                 atomic_long_add(x, &memcg->stat[i]);
2099 
2100                         if (i >= NR_VM_NODE_STAT_ITEMS)
2101                                 continue;
2102 
2103                         for_each_node(nid) {
2104                                 struct mem_cgroup_per_node *pn;
2105 
2106                                 pn = mem_cgroup_nodeinfo(memcg, nid);
2107                                 x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0);
2108                                 if (x)
2109                                         atomic_long_add(x, &pn->lruvec_stat[i]);
2110                         }
2111                 }
2112 
2113                 for (i = 0; i < NR_VM_EVENT_ITEMS; i++) {
2114                         long x;
2115 
2116                         x = this_cpu_xchg(memcg->stat_cpu->events[i], 0);
2117                         if (x)
2118                                 atomic_long_add(x, &memcg->events[i]);
2119                 }
2120         }
2121 
2122         return 0;
2123 }
2124 
2125 static void reclaim_high(struct mem_cgroup *memcg,
2126                          unsigned int nr_pages,
2127                          gfp_t gfp_mask)
2128 {
2129         do {
2130                 if (page_counter_read(&memcg->memory) <= memcg->high)
2131                         continue;
2132                 memcg_memory_event(memcg, MEMCG_HIGH);
2133                 try_to_free_mem_cgroup_pages(memcg, nr_pages, gfp_mask, true);
2134         } while ((memcg = parent_mem_cgroup(memcg)));
2135 }
2136 
2137 static void high_work_func(struct work_struct *work)
2138 {
2139         struct mem_cgroup *memcg;
2140 
2141         memcg = container_of(work, struct mem_cgroup, high_work);
2142         reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL);
2143 }
2144 
2145 /*
2146  * Scheduled by try_charge() to be executed from the userland return path
2147  * and reclaims memory over the high limit.
2148  */
2149 void mem_cgroup_handle_over_high(void)
2150 {
2151         unsigned int nr_pages = current->memcg_nr_pages_over_high;
2152         struct mem_cgroup *memcg;
2153 
2154         if (likely(!nr_pages))
2155                 return;
2156 
2157         memcg = get_mem_cgroup_from_mm(current->mm);
2158         reclaim_high(memcg, nr_pages, GFP_KERNEL);
2159         css_put(&memcg->css);
2160         current->memcg_nr_pages_over_high = 0;
2161 }
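/*
 * Illustrative sketch, not part of the original file: the pairing
 * between the producer in try_charge() below and this consumer on the
 * return-to-userland path (in mainline the call is made from the
 * resume/task-work path). The wrapper name is an assumption.
 */
static __maybe_unused void memcg_return_to_user_sketch(void)
{
	/*
	 * Producer side (see done_restock in try_charge()):
	 *   current->memcg_nr_pages_over_high += batch;
	 *   set_notify_resume(current);
	 *
	 * Consumer side, with no locks held:
	 */
	mem_cgroup_handle_over_high();
}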
2162 
2163 static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
2164                       unsigned int nr_pages)
2165 {
2166         unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages);
2167         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2168         struct mem_cgroup *mem_over_limit;
2169         struct page_counter *counter;
2170         unsigned long nr_reclaimed;
2171         bool may_swap = true;
2172         bool drained = false;
2173         bool oomed = false;
2174         enum oom_status oom_status;
2175 
2176         if (mem_cgroup_is_root(memcg))
2177                 return 0;
2178 retry:
2179         if (consume_stock(memcg, nr_pages))
2180                 return 0;
2181 
2182         if (!do_memsw_account() ||
2183             page_counter_try_charge(&memcg->memsw, batch, &counter)) {
2184                 if (page_counter_try_charge(&memcg->memory, batch, &counter))
2185                         goto done_restock;
2186                 if (do_memsw_account())
2187                         page_counter_uncharge(&memcg->memsw, batch);
2188                 mem_over_limit = mem_cgroup_from_counter(counter, memory);
2189         } else {
2190                 mem_over_limit = mem_cgroup_from_counter(counter, memsw);
2191                 may_swap = false;
2192         }
2193 
2194         if (batch > nr_pages) {
2195                 batch = nr_pages;
2196                 goto retry;
2197         }
2198 
2199         /*
2200          * Unlike in global OOM situations, memcg is not in a physical
2201          * memory shortage.  Allow dying and OOM-killed tasks to
2202          * bypass the last charges so that they can exit quickly and
2203          * free their memory.
2204          */
2205         if (unlikely(tsk_is_oom_victim(current) ||
2206                      fatal_signal_pending(current) ||
2207                      current->flags & PF_EXITING))
2208                 goto force;
2209 
2210         /*
2211          * Prevent unbounded recursion when reclaim operations need to
2212          * allocate memory. This might exceed the limits temporarily,
2213          * but we prefer facilitating memory reclaim and getting back
2214          * under the limit over triggering OOM kills in these cases.
2215          */
2216         if (unlikely(current->flags & PF_MEMALLOC))
2217                 goto force;
2218 
2219         if (unlikely(task_in_memcg_oom(current)))
2220                 goto nomem;
2221 
2222         if (!gfpflags_allow_blocking(gfp_mask))
2223                 goto nomem;
2224 
2225         memcg_memory_event(mem_over_limit, MEMCG_MAX);
2226 
2227         nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
2228                                                     gfp_mask, may_swap);
2229 
2230         if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
2231                 goto retry;
2232 
2233         if (!drained) {
2234                 drain_all_stock(mem_over_limit);
2235                 drained = true;
2236                 goto retry;
2237         }
2238 
2239         if (gfp_mask & __GFP_NORETRY)
2240                 goto nomem;
2241         /*
2242          * Even though the limit is exceeded at this point, reclaim
2243          * may have been able to free some pages.  Retry the charge
2244          * before killing the task.
2245          *
2246          * Only for regular pages, though: huge pages are rather
2247          * unlikely to succeed so close to the limit, and we fall back
2248          * to regular pages anyway in case of failure.
2249          */
2250         if (nr_reclaimed && nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER))
2251                 goto retry;
2252         /*
2253          * During a task move, charge accounts can be double-counted, so it's
2254          * better to wait until the end of the move if one is in progress.
2255          */
2256         if (mem_cgroup_wait_acct_move(mem_over_limit))
2257                 goto retry;
2258 
2259         if (nr_retries--)
2260                 goto retry;
2261 
2262         if (gfp_mask & __GFP_RETRY_MAYFAIL && oomed)
2263                 goto nomem;
2264 
2265         if (gfp_mask & __GFP_NOFAIL)
2266                 goto force;
2267 
2268         if (fatal_signal_pending(current))
2269                 goto force;
2270 
2271         /*
2272          * keep retrying as long as the memcg oom killer is able to make
2273          * a forward progress or bypass the charge if the oom killer
2274          * couldn't make any progress.
2275          */
2276         oom_status = mem_cgroup_oom(mem_over_limit, gfp_mask,
2277                        get_order(nr_pages * PAGE_SIZE));
2278         switch (oom_status) {
2279         case OOM_SUCCESS:
2280                 nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2281                 oomed = true;
2282                 goto retry;
2283         case OOM_FAILED:
2284                 goto force;
2285         default:
2286                 goto nomem;
2287         }
2288 nomem:
2289         if (!(gfp_mask & __GFP_NOFAIL))
2290                 return -ENOMEM;
2291 force:
2292         /*
2293          * The allocation either can't fail or will lead to more memory
2294          * being freed very soon.  Allow memory usage to go over the limit
2295          * temporarily by force charging it.
2296          */
2297         page_counter_charge(&memcg->memory, nr_pages);
2298         if (do_memsw_account())
2299                 page_counter_charge(&memcg->memsw, nr_pages);
2300         css_get_many(&memcg->css, nr_pages);
2301 
2302         return 0;
2303 
2304 done_restock:
2305         css_get_many(&memcg->css, batch);
2306         if (batch > nr_pages)
2307                 refill_stock(memcg, batch - nr_pages);
2308 
2309         /*
2310          * If the hierarchy is above the normal consumption range, schedule
2311          * reclaim on returning to userland.  We can perform reclaim here
2312          * if __GFP_RECLAIM but let's always punt for simplicity and so that
2313          * GFP_KERNEL can consistently be used during reclaim.  @memcg is
2314          * not recorded as it most likely matches current's and won't
2315          * change in the meantime.  As high limit is checked again before
2316          * reclaim, the cost of mismatch is negligible.
2317          */
2318         do {
2319                 if (page_counter_read(&memcg->memory) > memcg->high) {
2320                         /* Don't bother a random interrupted task */
2321                         if (in_interrupt()) {
2322                                 schedule_work(&memcg->high_work);
2323                                 break;
2324                         }
2325                         current->memcg_nr_pages_over_high += batch;
2326                         set_notify_resume(current);
2327                         break;
2328                 }
2329         } while ((memcg = parent_mem_cgroup(memcg)));
2330 
2331         return 0;
2332 }
2333 
2334 static void cancel_charge(struct mem_cgroup *memcg, unsigned int nr_pages)
2335 {
2336         if (mem_cgroup_is_root(memcg))
2337                 return;
2338 
2339         page_counter_uncharge(&memcg->memory, nr_pages);
2340         if (do_memsw_account())
2341                 page_counter_uncharge(&memcg->memsw, nr_pages);
2342 
2343         css_put_many(&memcg->css, nr_pages);
2344 }
2345 
2346 static void lock_page_lru(struct page *page, int *isolated)
2347 {
2348         struct zone *zone = page_zone(page);
2349 
2350         spin_lock_irq(zone_lru_lock(zone));
2351         if (PageLRU(page)) {
2352                 struct lruvec *lruvec;
2353 
2354                 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2355                 ClearPageLRU(page);
2356                 del_page_from_lru_list(page, lruvec, page_lru(page));
2357                 *isolated = 1;
2358         } else
2359                 *isolated = 0;
2360 }
2361 
2362 static void unlock_page_lru(struct page *page, int isolated)
2363 {
2364         struct zone *zone = page_zone(page);
2365 
2366         if (isolated) {
2367                 struct lruvec *lruvec;
2368 
2369                 lruvec = mem_cgroup_page_lruvec(page, zone->zone_pgdat);
2370                 VM_BUG_ON_PAGE(PageLRU(page), page);
2371                 SetPageLRU(page);
2372                 add_page_to_lru_list(page, lruvec, page_lru(page));
2373         }
2374         spin_unlock_irq(zone_lru_lock(zone));
2375 }
2376 
2377 static void commit_charge(struct page *page, struct mem_cgroup *memcg,
2378                           bool lrucare)
2379 {
2380         int isolated;
2381 
2382         VM_BUG_ON_PAGE(page->mem_cgroup, page);
2383 
2384         /*
2385          * In some cases, e.g. SwapCache and FUSE (splice_buf->radixtree), the page
2386          * may already be on some other mem_cgroup's LRU.  Take care of it.
2387          */
2388         if (lrucare)
2389                 lock_page_lru(page, &isolated);
2390 
2391         /*
2392          * Nobody should be changing or seriously looking at
2393          * page->mem_cgroup at this point:
2394          *
2395          * - the page is uncharged
2396          *
2397          * - the page is off-LRU
2398          *
2399          * - an anonymous fault has exclusive page access, except for
2400          *   a locked page table
2401          *
2402          * - a page cache insertion, a swapin fault, or a migration
2403          *   have the page locked
2404          */
2405         page->mem_cgroup = memcg;
2406 
2407         if (lrucare)
2408                 unlock_page_lru(page, isolated);
2409 }
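/*
 * Illustrative sketch, not part of the original file: the
 * try/commit/cancel protocol implemented by the helpers above. The
 * real entry points live further down in this file; this condensed
 * version only shows the ordering, and the @abort parameter is an
 * assumption standing in for "the caller changed its mind".
 */
static __maybe_unused int memcg_charge_protocol_sketch(struct page *page,
						       struct mem_cgroup *memcg,
						       gfp_t gfp_mask, bool abort)
{
	int ret;

	ret = try_charge(memcg, gfp_mask, 1);	/* reserve one page */
	if (ret)
		return ret;

	if (abort) {
		cancel_charge(memcg, 1);	/* return the reservation */
		return -EBUSY;
	}

	commit_charge(page, memcg, false);	/* bind page->mem_cgroup */
	return 0;
}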
2410 
2411 #ifdef CONFIG_MEMCG_KMEM
2412 static int memcg_alloc_cache_id(void)
2413 {
2414         int id, size;
2415         int err;
2416 
2417         id = ida_simple_get(&memcg_cache_ida,
2418                             0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL);
2419         if (id < 0)
2420                 return id;
2421 
2422         if (id < memcg_nr_cache_ids)
2423                 return id;
2424 
2425         /*
2426          * There's no space for the new id in memcg_caches arrays,
2427          * so we have to grow them.
2428          */
2429         down_write(&memcg_cache_ids_sem);
2430 
2431         size = 2 * (id + 1);
2432         if (size < MEMCG_CACHES_MIN_SIZE)
2433                 size = MEMCG_CACHES_MIN_SIZE;
2434         else if (size > MEMCG_CACHES_MAX_SIZE)
2435                 size = MEMCG_CACHES_MAX_SIZE;
2436 
2437         err = memcg_update_all_caches(size);
2438         if (!err)
2439                 err = memcg_update_all_list_lrus(size);
2440         if (!err)
2441                 memcg_nr_cache_ids = size;
2442 
2443         up_write(&memcg_cache_ids_sem);
2444 
2445         if (err) {
2446                 ida_simple_remove(&memcg_cache_ida, id);
2447                 return err;
2448         }
2449         return id;
2450 }
2451 
2452 static void memcg_free_cache_id(int id)
2453 {
2454         ida_simple_remove(&memcg_cache_ida, id);
2455 }
2456 
2457 struct memcg_kmem_cache_create_work {
2458         struct mem_cgroup *memcg;
2459         struct kmem_cache *cachep;
2460         struct work_struct work;
2461 };
2462 
2463 static void memcg_kmem_cache_create_func(struct work_struct *w)
2464 {
2465         struct memcg_kmem_cache_create_work *cw =
2466                 container_of(w, struct memcg_kmem_cache_create_work, work);
2467         struct mem_cgroup *memcg = cw->memcg;
2468         struct kmem_cache *cachep = cw->cachep;
2469 
2470         memcg_create_kmem_cache(memcg, cachep);
2471 
2472         css_put(&memcg->css);
2473         kfree(cw);
2474 }
2475 
2476 /*
2477  * Enqueue the creation of a per-memcg kmem_cache.
2478  */
2479 static void memcg_schedule_kmem_cache_create(struct mem_cgroup *memcg,
2480                                                struct kmem_cache *cachep)
2481 {
2482         struct memcg_kmem_cache_create_work *cw;
2483 
2484         cw = kmalloc(sizeof(*cw), GFP_NOWAIT | __GFP_NOWARN);
2485         if (!cw)
2486                 return;
2487 
2488         css_get(&memcg->css);
2489 
2490         cw->memcg = memcg;
2491         cw->cachep = cachep;
2492         INIT_WORK(&cw->work, memcg_kmem_cache_create_func);
2493 
2494         queue_work(memcg_kmem_cache_wq, &cw->work);
2495 }
2496 
2497 static inline bool memcg_kmem_bypass(void)
2498 {
2499         if (in_interrupt() || !current->mm || (current->flags & PF_KTHREAD))
2500                 return true;
2501         return false;
2502 }
2503 
2504 /**
2505  * memcg_kmem_get_cache: select the correct per-memcg cache for allocation
2506  * @cachep: the original global kmem cache
2507  *
2508  * Return the kmem_cache we're supposed to use for a slab allocation.
2509  * We try to use the current memcg's version of the cache.
2510  *
2511  * If the cache does not exist yet and we are the first user of it, we
2512  * create it asynchronously in a workqueue and let the current allocation
2513  * go through with the original cache.
2514  *
2515  * This function takes a reference to the cache it returns to assure it
2516  * won't get destroyed while we are working with it. Once the caller is
2517  * done with it, memcg_kmem_put_cache() must be called to release the
2518  * reference.
2519  */
2520 struct kmem_cache *memcg_kmem_get_cache(struct kmem_cache *cachep)
2521 {
2522         struct mem_cgroup *memcg;
2523         struct kmem_cache *memcg_cachep;
2524         int kmemcg_id;
2525 
2526         VM_BUG_ON(!is_root_cache(cachep));
2527 
2528         if (memcg_kmem_bypass())
2529                 return cachep;
2530 
2531         memcg = get_mem_cgroup_from_current();
2532         kmemcg_id = READ_ONCE(memcg->kmemcg_id);
2533         if (kmemcg_id < 0)
2534                 goto out;
2535 
2536         memcg_cachep = cache_from_memcg_idx(cachep, kmemcg_id);
2537         if (likely(memcg_cachep))
2538                 return memcg_cachep;
2539 
2540         /*
2541          * If we are in a safe context (can wait, and not in interrupt
2542  * context), we could be predictable and return right away.
2543          * This would guarantee that the allocation being performed
2544          * already belongs in the new cache.
2545          *
2546  * However, there are some clashes that can arise from locking.
2547          * For instance, because we acquire the slab_mutex while doing
2548          * memcg_create_kmem_cache, this means no further allocation
2549          * could happen with the slab_mutex held. So it's better to
2550          * defer everything.
2551          */
2552         memcg_schedule_kmem_cache_create(memcg, cachep);
2553 out:
2554         css_put(&memcg->css);
2555         return cachep;
2556 }
2557 
2558 /**
2559  * memcg_kmem_put_cache: drop reference taken by memcg_kmem_get_cache
2560  * @cachep: the cache returned by memcg_kmem_get_cache
2561  */
2562 void memcg_kmem_put_cache(struct kmem_cache *cachep)
2563 {
2564         if (!is_root_cache(cachep))
2565                 css_put(&cachep->memcg_params.memcg->css);
2566 }
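/*
 * Illustrative sketch, not part of the original file: the get/put
 * pairing a slab allocation path is expected to follow around a
 * memcg-aware allocation. The object allocation itself is elided.
 */
static __maybe_unused void memcg_slab_cache_pairing_sketch(struct kmem_cache *root_cachep)
{
	struct kmem_cache *s;

	s = memcg_kmem_get_cache(root_cachep);	/* per-memcg cache, or the root one */
	/* ... allocate the object from @s ... */
	memcg_kmem_put_cache(s);		/* drop the reference taken above */
}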
2567 
2568 /**
2569  * memcg_kmem_charge_memcg: charge a kmem page
2570  * @page: page to charge
2571  * @gfp: reclaim mode
2572  * @order: allocation order
2573  * @memcg: memory cgroup to charge
2574  *
2575  * Returns 0 on success, an error code on failure.
2576  */
2577 int memcg_kmem_charge_memcg(struct page *page, gfp_t gfp, int order,
2578                             struct mem_cgroup *memcg)
2579 {
2580         unsigned int nr_pages = 1 << order;
2581         struct page_counter *counter;
2582         int ret;
2583 
2584         ret = try_charge(memcg, gfp, nr_pages);
2585         if (ret)
2586                 return ret;
2587 
2588         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) &&
2589             !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) {
2590                 cancel_charge(memcg, nr_pages);
2591                 return -ENOMEM;
2592         }
2593 
2594         page->mem_cgroup = memcg;
2595 
2596         return 0;
2597 }
2598 
2599 /**
2600  * memcg_kmem_charge: charge a kmem page to the current memory cgroup
2601  * @page: page to charge
2602  * @gfp: reclaim mode
2603  * @order: allocation order
2604  *
2605  * Returns 0 on success, an error code on failure.
2606  */
2607 int memcg_kmem_charge(struct page *page, gfp_t gfp, int order)
2608 {
2609         struct mem_cgroup *memcg;
2610         int ret = 0;
2611 
2612         if (mem_cgroup_disabled() || memcg_kmem_bypass())
2613                 return 0;
2614 
2615         memcg = get_mem_cgroup_from_current();
2616         if (!mem_cgroup_is_root(memcg)) {
2617                 ret = memcg_kmem_charge_memcg(page, gfp, order, memcg);
2618                 if (!ret)
2619                         __SetPageKmemcg(page);
2620         }
2621         css_put(&memcg->css);
2622         return ret;
2623 }
2624 /**
2625  * memcg_kmem_uncharge: uncharge a kmem page
2626  * @page: page to uncharge
2627  * @order: allocation order
2628  */
2629 void memcg_kmem_uncharge(struct page *page, int order)
2630 {
2631         struct mem_cgroup *memcg = page->mem_cgroup;
2632         unsigned int nr_pages = 1 << order;
2633 
2634         if (!memcg)
2635                 return;
2636 
2637         VM_BUG_ON_PAGE(mem_cgroup_is_root(memcg), page);
2638 
2639         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys))
2640                 page_counter_uncharge(&memcg->kmem, nr_pages);
2641 
2642         page_counter_uncharge(&memcg->memory, nr_pages);
2643         if (do_memsw_account())
2644                 page_counter_uncharge(&memcg->memsw, nr_pages);
2645 
2646         page->mem_cgroup = NULL;
2647 
2648         /* slab pages do not have PageKmemcg flag set */
2649         if (PageKmemcg(page))
2650                 __ClearPageKmemcg(page);
2651 
2652         css_put_many(&memcg->css, nr_pages);
2653 }
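/*
 * Illustrative sketch, not part of the original file: the
 * charge/uncharge pairing for an accounted kernel page allocation. In
 * mainline the charge side is driven from the page allocator for
 * __GFP_ACCOUNT requests; the helper below exists only to show the
 * pairing with memcg_kmem_uncharge() at free time.
 */
static __maybe_unused struct page *memcg_kmem_alloc_sketch(gfp_t gfp, int order)
{
	struct page *page;

	page = alloc_pages(gfp, order);
	if (!page)
		return NULL;

	if (memcg_kmem_charge(page, gfp, order)) {
		__free_pages(page, order);
		return NULL;
	}
	/* Freeing the page later ends up in memcg_kmem_uncharge(page, order). */
	return page;
}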
2654 #endif /* CONFIG_MEMCG_KMEM */
2655 
2656 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
2657 
2658 /*
2659  * Because tail pages are not marked as "used", mark them here. We're under
2660  * zone_lru_lock, and migration entries are set up in all page mappings.
2661  */
2662 void mem_cgroup_split_huge_fixup(struct page *head)
2663 {
2664         int i;
2665 
2666         if (mem_cgroup_disabled())
2667                 return;
2668 
2669         for (i = 1; i < HPAGE_PMD_NR; i++)
2670                 head[i].mem_cgroup = head->mem_cgroup;
2671 
2672         __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR);
2673 }
2674 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */
2675 
2676 #ifdef CONFIG_MEMCG_SWAP
2677 /**
2678  * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record.
2679  * @entry: swap entry to be moved
2680  * @from:  mem_cgroup which the entry is moved from
2681  * @to:  mem_cgroup which the entry is moved to
2682  *
2683  * It succeeds only when the swap_cgroup's record for this entry is the same
2684  * as the mem_cgroup's id of @from.
2685  *
2686  * Returns 0 on success, -EINVAL on failure.
2687  *
2688  * The caller must have charged to @to, IOW, called page_counter_charge() on
2689  * both res and memsw, and called css_get().
2690  */
2691 static int mem_cgroup_move_swap_account(swp_entry_t entry,
2692                                 struct mem_cgroup *from, struct mem_cgroup *to)
2693 {
2694         unsigned short old_id, new_id;
2695 
2696         old_id = mem_cgroup_id(from);
2697         new_id = mem_cgroup_id(to);
2698 
2699         if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) {
2700                 mod_memcg_state(from, MEMCG_SWAP, -1);
2701                 mod_memcg_state(to, MEMCG_SWAP, 1);
2702                 return 0;
2703         }
2704         return -EINVAL;
2705 }
2706 #else
2707 static inline int mem_cgroup_move_swap_account(swp_entry_t entry,
2708                                 struct mem_cgroup *from, struct mem_cgroup *to)
2709 {
2710         return -EINVAL;
2711 }
2712 #endif
2713 
2714 static DEFINE_MUTEX(memcg_max_mutex);
2715 
2716 static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
2717                                  unsigned long max, bool memsw)
2718 {
2719         bool enlarge = false;
2720         bool drained = false;
2721         int ret;
2722         bool limits_invariant;
2723         struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory;
2724 
2725         do {
2726                 if (signal_pending(current)) {
2727                         ret = -EINTR;
2728                         break;
2729                 }
2730 
2731                 mutex_lock(&memcg_max_mutex);
2732                 /*
2733                  * Make sure that the new limit (memsw or memory limit) doesn't
2734                  * break our basic invariant rule memory.max <= memsw.max.
2735                  */
2736                 limits_invariant = memsw ? max >= memcg->memory.max :
2737                                            max <= memcg->memsw.max;
2738                 if (!limits_invariant) {
2739                         mutex_unlock(&memcg_max_mutex);
2740                         ret = -EINVAL;
2741                         break;
2742                 }
2743                 if (max > counter->max)
2744                         enlarge = true;
2745                 ret = page_counter_set_max(counter, max);
2746                 mutex_unlock(&memcg_max_mutex);
2747 
2748                 if (!ret)
2749                         break;
2750 
2751                 if (!drained) {
2752                         drain_all_stock(memcg);
2753                         drained = true;
2754                         continue;
2755                 }
2756 
2757                 if (!try_to_free_mem_cgroup_pages(memcg, 1,
2758                                         GFP_KERNEL, !memsw)) {
2759                         ret = -EBUSY;
2760                         break;
2761                 }
2762         } while (true);
2763 
2764         if (!ret && enlarge)
2765                 memcg_oom_recover(memcg);
2766 
2767         return ret;
2768 }
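/*
 * Illustrative sketch, not part of the original file: how a limit
 * write from userspace maps onto the resize helper above (the real
 * cgroup file handler further down in this file also parses the value
 * and dispatches on the counter type). The wrapper name is an
 * assumption.
 */
static __maybe_unused int memcg_set_limit_sketch(struct mem_cgroup *memcg,
						 unsigned long nr_pages,
						 bool memsw)
{
	/* Shrinks usage toward @nr_pages; -EBUSY if reclaim can't get there. */
	return mem_cgroup_resize_max(memcg, nr_pages, memsw);
}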
2769 
2770 unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order,
2771                                             gfp_t gfp_mask,
2772                                             unsigned long *total_scanned)
2773 {
2774         unsigned long nr_reclaimed = 0;
2775         struct mem_cgroup_per_node *mz, *next_mz = NULL;
2776         unsigned long reclaimed;
2777         int loop = 0;
2778         struct mem_cgroup_tree_per_node *mctz;
2779         unsigned long excess;
2780         unsigned long nr_scanned;
2781 
2782         if (order > 0)
2783                 return 0;
2784 
2785         mctz = soft_limit_tree_node(pgdat->node_id);
2786 
2787         /*
2788          * Do not even bother to check the largest node if the root
2789          * is empty. Do it lockless to prevent lock bouncing. Races
2790          * are acceptable as soft limit is best effort anyway.
2791          */
2792         if (!mctz || RB_EMPTY_ROOT(&mctz->rb_root))
2793                 return 0;
2794 
2795         /*
2796          * This loop can run for a while, especially if mem_cgroups continuously
2797          * keep exceeding their soft limit and putting the system under
2798          * pressure.
2799          */
2800         do {
2801                 if (next_mz)
2802                         mz = next_mz;
2803                 else
2804                         mz = mem_cgroup_largest_soft_limit_node(mctz);
2805                 if (!mz)
2806                         break;
2807 
2808                 nr_scanned = 0;
2809                 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, pgdat,
2810                                                     gfp_mask, &nr_scanned);
2811                 nr_reclaimed += reclaimed;
2812                 *total_scanned += nr_scanned;
2813                 spin_lock_irq(&mctz->lock);
2814                 __mem_cgroup_remove_exceeded(mz, mctz);
2815 
2816                 /*
2817                  * If we failed to reclaim anything from this memory cgroup
2818                  * it is time to move on to the next cgroup
2819                  */
2820                 next_mz = NULL;
2821                 if (!reclaimed)
2822                         next_mz = __mem_cgroup_largest_soft_limit_node(mctz);
2823 
2824                 excess = soft_limit_excess(mz->memcg);
2825                 /*
2826                  * One school of thought says that we should not add
2827                  * back the node to the tree if reclaim returns 0.
2828                  * But our reclaim could return 0 simply because, due
2829                  * to priority, we are exposing a smaller subset of
2830                  * memory to reclaim from. Consider this as a longer
2831                  * term TODO.
2832                  */
2833                 /* If excess == 0, no tree ops */
2834                 __mem_cgroup_insert_exceeded(mz, mctz, excess);
2835                 spin_unlock_irq(&mctz->lock);
2836                 css_put(&mz->memcg->css);
2837                 loop++;
2838                 /*
2839                  * Could not reclaim anything and there are no more
2840                  * mem cgroups to try or we seem to be looping without
2841                  * reclaiming anything.
2842                  */
2843                 if (!nr_reclaimed &&
2844                         (next_mz == NULL ||
2845                         loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS))
2846                         break;
2847         } while (!nr_reclaimed);
2848         if (next_mz)
2849                 css_put(&next_mz->memcg->css);
2850         return nr_reclaimed;
2851 }
2852 
2853 /*
2854  * Test whether @memcg has children, dead or alive.  Note that this
2855  * function doesn't care whether @memcg has use_hierarchy enabled and
2856  * returns %true if there are child csses according to the cgroup
2857  * hierarchy.  Testing use_hierarchy is the caller's responsibility.
2858  */
2859 static inline bool memcg_has_children(struct mem_cgroup *memcg)
2860 {
2861         bool ret;
2862 
2863         rcu_read_lock();
2864         ret = css_next_child(NULL, &memcg->css);
2865         rcu_read_unlock();
2866         return ret;
2867 }
2868 
2869 /*
2870  * Reclaims as many pages from the given memcg as possible.
2871  *
2872  * Caller is responsible for holding css reference for memcg.
2873  */
2874 static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
2875 {
2876         int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
2877 
2878         /* we call try-to-free pages to make this cgroup empty */
2879         lru_add_drain_all();
2880 
2881         drain_all_stock(memcg);
2882 
2883         /* try to free all pages in this cgroup */
2884         while (nr_retries && page_counter_read(&memcg->memory)) {
2885                 int progress;
2886 
2887                 if (signal_pending(current))
2888                         return -EINTR;
2889 
2890                 progress = try_to_free_mem_cgroup_pages(memcg, 1,
2891                                                         GFP_KERNEL, true);
2892                 if (!progress) {
2893                         nr_retries--;
2894                         /* maybe some writeback is necessary */
2895                         congestion_wait(BLK_RW_ASYNC, HZ/10);
2896                 }
2897 
2898         }
2899 
2900         return 0;
2901 }
2902 
2903 static ssize_t mem_cgroup_force_empty_write(struct kernfs_open_file *of,
2904                                             char *buf, size_t nbytes,
2905                                             loff_t off)
2906 {
2907         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
2908 
2909         if (mem_cgroup_is_root(memcg))
2910                 return -EINVAL;
2911         return mem_cgroup_force_empty(memcg) ?: nbytes;
2912 }
2913 
2914 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css,
2915                                      struct cftype *cft)
2916 {
2917         return mem_cgroup_from_css(css)->use_hierarchy;
2918 }
2919 
2920 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css,
2921                                       struct cftype *cft, u64 val)
2922 {
2923         int retval = 0;
2924         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
2925         struct mem_cgroup *parent_memcg = mem_cgroup_from_css(memcg->css.parent);
2926 
2927         if (memcg->use_hierarchy == val)
2928                 return 0;
2929 
2930         /*
2931          * If parent's use_hierarchy is set, we can't make any modifications
2932          * in the child subtrees. If it is unset, then the change can
2933          * occur, provided the current cgroup has no children.
2934          *
2935          * For the root cgroup, parent_memcg is NULL; we allow the value to be
2936          * set if there are no children.
2937          */
2938         if ((!parent_memcg || !parent_memcg->use_hierarchy) &&
2939                                 (val == 1 || val == 0)) {
2940                 if (!memcg_has_children(memcg))
2941                         memcg->use_hierarchy = val;
2942                 else
2943                         retval = -EBUSY;
2944         } else
2945                 retval = -EINVAL;
2946 
2947         return retval;
2948 }
2949 
2950 struct accumulated_stats {
2951         unsigned long stat[MEMCG_NR_STAT];
2952         unsigned long events[NR_VM_EVENT_ITEMS];
2953         unsigned long lru_pages[NR_LRU_LISTS];
2954         const unsigned int *stats_array;
2955         const unsigned int *events_array;
2956         int stats_size;
2957         int events_size;
2958 };
2959 
2960 static void accumulate_memcg_tree(struct mem_cgroup *memcg,
2961                                   struct accumulated_stats *acc)
2962 {
2963         struct mem_cgroup *mi;
2964         int i;
2965 
2966         for_each_mem_cgroup_tree(mi, memcg) {
2967                 for (i = 0; i < acc->stats_size; i++)
2968                         acc->stat[i] += memcg_page_state(mi,
2969                                 acc->stats_array ? acc->stats_array[i] : i);
2970 
2971                 for (i = 0; i < acc->events_size; i++)
2972                         acc->events[i] += memcg_sum_events(mi,
2973                                 acc->events_array ? acc->events_array[i] : i);
2974 
2975                 for (i = 0; i < NR_LRU_LISTS; i++)
2976                         acc->lru_pages[i] +=
2977                                 mem_cgroup_nr_lru_pages(mi, BIT(i));
2978         }
2979 }
2980 
2981 static unsigned long mem_cgroup_usage(struct mem_cgroup *memcg, bool swap)
2982 {
2983         unsigned long val = 0;
2984 
2985         if (mem_cgroup_is_root(memcg)) {
2986                 struct mem_cgroup *iter;
2987 
2988                 for_each_mem_cgroup_tree(iter, memcg) {
2989                         val += memcg_page_state(iter, MEMCG_CACHE);
2990                         val += memcg_page_state(iter, MEMCG_RSS);
2991                         if (swap)
2992                                 val += memcg_page_state(iter, MEMCG_SWAP);
2993                 }
2994         } else {
2995                 if (!swap)
2996                         val = page_counter_read(&memcg->memory);
2997                 else
2998                         val = page_counter_read(&memcg->memsw);
2999         }
3000         return val;
3001 }
3002 
3003 enum {
3004         RES_USAGE,
3005         RES_LIMIT,
3006         RES_MAX_USAGE,
3007         RES_FAILCNT,
3008         RES_SOFT_LIMIT,
3009 };
3010 
3011 static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css,
3012                                struct cftype *cft)
3013 {
3014         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3015         struct page_counter *counter;
3016 
3017         switch (MEMFILE_TYPE(cft->private)) {
3018         case _MEM:
3019                 counter = &memcg->memory;
3020                 break;
3021         case _MEMSWAP:
3022                 counter = &memcg->memsw;
3023                 break;
3024         case _KMEM:
3025                 counter = &memcg->kmem;
3026                 break;
3027         case _TCP:
3028                 counter = &memcg->tcpmem;
3029                 break;
3030         default:
3031                 BUG();
3032         }
3033 
3034         switch (MEMFILE_ATTR(cft->private)) {
3035         case RES_USAGE:
3036                 if (counter == &memcg->memory)
3037                         return (u64)mem_cgroup_usage(memcg, false) * PAGE_SIZE;
3038                 if (counter == &memcg->memsw)
3039                         return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE;
3040                 return (u64)page_counter_read(counter) * PAGE_SIZE;
3041         case RES_LIMIT:
3042                 return (u64)counter->max * PAGE_SIZE;
3043         case RES_MAX_USAGE:
3044                 return (u64)counter->watermark * PAGE_SIZE;
3045         case RES_FAILCNT:
3046                 return counter->failcnt;
3047         case RES_SOFT_LIMIT:
3048                 return (u64)memcg->soft_limit * PAGE_SIZE;
3049         default:
3050                 BUG();
3051         }
3052 }
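
/*
 * Illustrative userspace sketch (not from this file): reading one of the
 * RES_USAGE values that mem_cgroup_read_u64() exports.  The cgroup v1
 * mount point and the group name "example" are assumptions made for the
 * sake of the example.
 */
#if 0	/* example only, never built as part of the kernel */
#include <stdio.h>

int main(void)
{
        unsigned long long bytes;
        FILE *f = fopen("/sys/fs/cgroup/memory/example/memory.usage_in_bytes", "r");

        if (!f)
                return 1;
        if (fscanf(f, "%llu", &bytes) == 1)     /* pages * PAGE_SIZE, see above */
                printf("usage: %llu bytes\n", bytes);
        fclose(f);
        return 0;
}
#endif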
3053 
3054 #ifdef CONFIG_MEMCG_KMEM
3055 static int memcg_online_kmem(struct mem_cgroup *memcg)
3056 {
3057         int memcg_id;
3058 
3059         if (cgroup_memory_nokmem)
3060                 return 0;
3061 
3062         BUG_ON(memcg->kmemcg_id >= 0);
3063         BUG_ON(memcg->kmem_state);
3064 
3065         memcg_id = memcg_alloc_cache_id();
3066         if (memcg_id < 0)
3067                 return memcg_id;
3068 
3069         static_branch_inc(&memcg_kmem_enabled_key);
3070         /*
3071          * A memory cgroup is considered kmem-online as soon as it gets
3072          * kmemcg_id. Setting the id after enabling static branching will
3073          * guarantee no one starts accounting before all call sites are
3074          * patched.
3075          */
3076         memcg->kmemcg_id = memcg_id;
3077         memcg->kmem_state = KMEM_ONLINE;
3078         INIT_LIST_HEAD(&memcg->kmem_caches);
3079 
3080         return 0;
3081 }
3082 
3083 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3084 {
3085         struct cgroup_subsys_state *css;
3086         struct mem_cgroup *parent, *child;
3087         int kmemcg_id;
3088 
3089         if (memcg->kmem_state != KMEM_ONLINE)
3090                 return;
3091         /*
3092          * Clear the online state before clearing memcg_caches array
3093          * entries. The slab_mutex in memcg_deactivate_kmem_caches()
3094          * guarantees that no cache will be created for this cgroup
3095          * after we are done (see memcg_create_kmem_cache()).
3096          */
3097         memcg->kmem_state = KMEM_ALLOCATED;
3098 
3099         memcg_deactivate_kmem_caches(memcg);
3100 
3101         kmemcg_id = memcg->kmemcg_id;
3102         BUG_ON(kmemcg_id < 0);
3103 
3104         parent = parent_mem_cgroup(memcg);
3105         if (!parent)
3106                 parent = root_mem_cgroup;
3107 
3108         /*
3109          * Change kmemcg_id of this cgroup and all its descendants to the
3110          * parent's id, and then move all entries from this cgroup's list_lrus
3111          * to ones of the parent. After we have finished, all list_lrus
3112          * corresponding to this cgroup are guaranteed to remain empty. The
3113          * ordering is imposed by list_lru_node->lock taken by
3114          * memcg_drain_all_list_lrus().
3115          */
3116         rcu_read_lock(); /* can be called from css_free w/o cgroup_mutex */
3117         css_for_each_descendant_pre(css, &memcg->css) {
3118                 child = mem_cgroup_from_css(css);
3119                 BUG_ON(child->kmemcg_id != kmemcg_id);
3120                 child->kmemcg_id = parent->kmemcg_id;
3121                 if (!memcg->use_hierarchy)
3122                         break;
3123         }
3124         rcu_read_unlock();
3125 
3126         memcg_drain_all_list_lrus(kmemcg_id, parent);
3127 
3128         memcg_free_cache_id(kmemcg_id);
3129 }
3130 
3131 static void memcg_free_kmem(struct mem_cgroup *memcg)
3132 {
3133         /* css_alloc() failed, offlining didn't happen */
3134         if (unlikely(memcg->kmem_state == KMEM_ONLINE))
3135                 memcg_offline_kmem(memcg);
3136 
3137         if (memcg->kmem_state == KMEM_ALLOCATED) {
3138                 memcg_destroy_kmem_caches(memcg);
3139                 static_branch_dec(&memcg_kmem_enabled_key);
3140                 WARN_ON(page_counter_read(&memcg->kmem));
3141         }
3142 }
3143 #else
3144 static int memcg_online_kmem(struct mem_cgroup *memcg)
3145 {
3146         return 0;
3147 }
3148 static void memcg_offline_kmem(struct mem_cgroup *memcg)
3149 {
3150 }
3151 static void memcg_free_kmem(struct mem_cgroup *memcg)
3152 {
3153 }
3154 #endif /* CONFIG_MEMCG_KMEM */
3155 
3156 static int memcg_update_kmem_max(struct mem_cgroup *memcg,
3157                                  unsigned long max)
3158 {
3159         int ret;
3160 
3161         mutex_lock(&memcg_max_mutex);
3162         ret = page_counter_set_max(&memcg->kmem, max);
3163         mutex_unlock(&memcg_max_mutex);
3164         return ret;
3165 }
3166 
3167 static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max)
3168 {
3169         int ret;
3170 
3171         mutex_lock(&memcg_max_mutex);
3172 
3173         ret = page_counter_set_max(&memcg->tcpmem, max);
3174         if (ret)
3175                 goto out;
3176 
3177         if (!memcg->tcpmem_active) {
3178                 /*
3179                  * The active flag needs to be written after the static_key
3180                  * update. This is what guarantees that the socket activation
3181                  * function is the last one to run. See mem_cgroup_sk_alloc()
3182                  * for details, and note that we don't mark any socket as
3183                  * belonging to this memcg until that flag is up.
3184                  *
3185                  * We need to do this, because static_keys will span multiple
3186                  * sites, but we can't control their order. If we mark a socket
3187                  * as accounted, but the accounting functions are not patched in
3188                  * yet, we'll lose accounting.
3189                  *
3190                  * We never race with the readers in mem_cgroup_sk_alloc(),
3191                  * because when this value changes, the code to process it is not
3192                  * patched in yet.
3193                  */
3194                 static_branch_inc(&memcg_sockets_enabled_key);
3195                 memcg->tcpmem_active = true;
3196         }
3197 out:
3198         mutex_unlock(&memcg_max_mutex);
3199         return ret;
3200 }
3201 
3202 /*
3203  * This handler backs the RES_LIMIT and
3204  * RES_SOFT_LIMIT files.
3205  */
3206 static ssize_t mem_cgroup_write(struct kernfs_open_file *of,
3207                                 char *buf, size_t nbytes, loff_t off)
3208 {
3209         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3210         unsigned long nr_pages;
3211         int ret;
3212 
3213         buf = strstrip(buf);
3214         ret = page_counter_memparse(buf, "-1", &nr_pages);
3215         if (ret)
3216                 return ret;
3217 
3218         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3219         case RES_LIMIT:
3220                 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */
3221                         ret = -EINVAL;
3222                         break;
3223                 }
3224                 switch (MEMFILE_TYPE(of_cft(of)->private)) {
3225                 case _MEM:
3226                         ret = mem_cgroup_resize_max(memcg, nr_pages, false);
3227                         break;
3228                 case _MEMSWAP:
3229                         ret = mem_cgroup_resize_max(memcg, nr_pages, true);
3230                         break;
3231                 case _KMEM:
3232                         ret = memcg_update_kmem_max(memcg, nr_pages);
3233                         break;
3234                 case _TCP:
3235                         ret = memcg_update_tcp_max(memcg, nr_pages);
3236                         break;
3237                 }
3238                 break;
3239         case RES_SOFT_LIMIT:
3240                 memcg->soft_limit = nr_pages;
3241                 ret = 0;
3242                 break;
3243         }
3244         return ret ?: nbytes;
3245 }
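
/*
 * Illustrative userspace sketch (not from this file): writing RES_LIMIT
 * through memory.limit_in_bytes.  page_counter_memparse() accepts the
 * usual memparse() suffixes such as K, M and G, as well as the string
 * "-1" for "no limit".  The mount point and group name are assumptions.
 */
#if 0	/* example only */
#include <fcntl.h>
#include <string.h>
#include <unistd.h>

static int write_limit(const char *val)
{
        int fd = open("/sys/fs/cgroup/memory/example/memory.limit_in_bytes",
                      O_WRONLY);
        ssize_t ret;

        if (fd < 0)
                return -1;
        ret = write(fd, val, strlen(val));
        close(fd);
        return ret < 0 ? -1 : 0;
}

/* e.g. write_limit("512M") to cap the group at 512 MiB, write_limit("-1") to remove the cap */
#endif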
3246 
3247 static ssize_t mem_cgroup_reset(struct kernfs_open_file *of, char *buf,
3248                                 size_t nbytes, loff_t off)
3249 {
3250         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
3251         struct page_counter *counter;
3252 
3253         switch (MEMFILE_TYPE(of_cft(of)->private)) {
3254         case _MEM:
3255                 counter = &memcg->memory;
3256                 break;
3257         case _MEMSWAP:
3258                 counter = &memcg->memsw;
3259                 break;
3260         case _KMEM:
3261                 counter = &memcg->kmem;
3262                 break;
3263         case _TCP:
3264                 counter = &memcg->tcpmem;
3265                 break;
3266         default:
3267                 BUG();
3268         }
3269 
3270         switch (MEMFILE_ATTR(of_cft(of)->private)) {
3271         case RES_MAX_USAGE:
3272                 page_counter_reset_watermark(counter);
3273                 break;
3274         case RES_FAILCNT:
3275                 counter->failcnt = 0;
3276                 break;
3277         default:
3278                 BUG();
3279         }
3280 
3281         return nbytes;
3282 }
3283 
3284 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css,
3285                                         struct cftype *cft)
3286 {
3287         return mem_cgroup_from_css(css)->move_charge_at_immigrate;
3288 }
3289 
3290 #ifdef CONFIG_MMU
3291 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3292                                         struct cftype *cft, u64 val)
3293 {
3294         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3295 
3296         if (val & ~MOVE_MASK)
3297                 return -EINVAL;
3298 
3299         /*
3300          * No kind of locking is needed in here, because ->can_attach() will
3301          * check this value once in the beginning of the process, and then carry
3302          * on with stale data. This means that changes to this value will only
3303          * affect task migrations starting after the change.
3304          */
3305         memcg->move_charge_at_immigrate = val;
3306         return 0;
3307 }
3308 #else
3309 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css,
3310                                         struct cftype *cft, u64 val)
3311 {
3312         return -ENOSYS;
3313 }
3314 #endif
3315 
3316 #ifdef CONFIG_NUMA
3317 static int memcg_numa_stat_show(struct seq_file *m, void *v)
3318 {
3319         struct numa_stat {
3320                 const char *name;
3321                 unsigned int lru_mask;
3322         };
3323 
3324         static const struct numa_stat stats[] = {
3325                 { "total", LRU_ALL },
3326                 { "file", LRU_ALL_FILE },
3327                 { "anon", LRU_ALL_ANON },
3328                 { "unevictable", BIT(LRU_UNEVICTABLE) },
3329         };
3330         const struct numa_stat *stat;
3331         int nid;
3332         unsigned long nr;
3333         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3334 
3335         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3336                 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask);
3337                 seq_printf(m, "%s=%lu", stat->name, nr);
3338                 for_each_node_state(nid, N_MEMORY) {
3339                         nr = mem_cgroup_node_nr_lru_pages(memcg, nid,
3340                                                           stat->lru_mask);
3341                         seq_printf(m, " N%d=%lu", nid, nr);
3342                 }
3343                 seq_putc(m, '\n');
3344         }
3345 
3346         for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
3347                 struct mem_cgroup *iter;
3348 
3349                 nr = 0;
3350                 for_each_mem_cgroup_tree(iter, memcg)
3351                         nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask);
3352                 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr);
3353                 for_each_node_state(nid, N_MEMORY) {
3354                         nr = 0;
3355                         for_each_mem_cgroup_tree(iter, memcg)
3356                                 nr += mem_cgroup_node_nr_lru_pages(
3357                                         iter, nid, stat->lru_mask);
3358                         seq_printf(m, " N%d=%lu", nid, nr);
3359                 }
3360                 seq_putc(m, '\n');
3361         }
3362 
3363         return 0;
3364 }
3365 #endif /* CONFIG_NUMA */
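
/*
 * For reference, memcg_numa_stat_show() emits one line per entry of
 * stats[] in the form "<name>=<pages> N0=<pages> N1=<pages> ...",
 * followed by the same set prefixed with "hierarchical_", where the
 * counts are summed over the whole subtree.
 */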
3366 
3367 /* Universal VM events that cgroup1 shows, in the original sort order */
3368 static const unsigned int memcg1_events[] = {
3369         PGPGIN,
3370         PGPGOUT,
3371         PGFAULT,
3372         PGMAJFAULT,
3373 };
3374 
3375 static const char *const memcg1_event_names[] = {
3376         "pgpgin",
3377         "pgpgout",
3378         "pgfault",
3379         "pgmajfault",
3380 };
3381 
3382 static int memcg_stat_show(struct seq_file *m, void *v)
3383 {
3384         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
3385         unsigned long memory, memsw;
3386         struct mem_cgroup *mi;
3387         unsigned int i;
3388         struct accumulated_stats acc;
3389 
3390         BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
3391         BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS);
3392 
3393         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3394                 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3395                         continue;
3396                 seq_printf(m, "%s %lu\n", memcg1_stat_names[i],
3397                            memcg_page_state(memcg, memcg1_stats[i]) *
3398                            PAGE_SIZE);
3399         }
3400 
3401         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3402                 seq_printf(m, "%s %lu\n", memcg1_event_names[i],
3403                            memcg_sum_events(memcg, memcg1_events[i]));
3404 
3405         for (i = 0; i < NR_LRU_LISTS; i++)
3406                 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i],
3407                            mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE);
3408 
3409         /* Hierarchical information */
3410         memory = memsw = PAGE_COUNTER_MAX;
3411         for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) {
3412                 memory = min(memory, mi->memory.max);
3413                 memsw = min(memsw, mi->memsw.max);
3414         }
3415         seq_printf(m, "hierarchical_memory_limit %llu\n",
3416                    (u64)memory * PAGE_SIZE);
3417         if (do_memsw_account())
3418                 seq_printf(m, "hierarchical_memsw_limit %llu\n",
3419                            (u64)memsw * PAGE_SIZE);
3420 
3421         memset(&acc, 0, sizeof(acc));
3422         acc.stats_size = ARRAY_SIZE(memcg1_stats);
3423         acc.stats_array = memcg1_stats;
3424         acc.events_size = ARRAY_SIZE(memcg1_events);
3425         acc.events_array = memcg1_events;
3426         accumulate_memcg_tree(memcg, &acc);
3427 
3428         for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
3429                 if (memcg1_stats[i] == MEMCG_SWAP && !do_memsw_account())
3430                         continue;
3431                 seq_printf(m, "total_%s %llu\n", memcg1_stat_names[i],
3432                            (u64)acc.stat[i] * PAGE_SIZE);
3433         }
3434 
3435         for (i = 0; i < ARRAY_SIZE(memcg1_events); i++)
3436                 seq_printf(m, "total_%s %llu\n", memcg1_event_names[i],
3437                            (u64)acc.events[i]);
3438 
3439         for (i = 0; i < NR_LRU_LISTS; i++)
3440                 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i],
3441                            (u64)acc.lru_pages[i] * PAGE_SIZE);
3442 
3443 #ifdef CONFIG_DEBUG_VM
3444         {
3445                 pg_data_t *pgdat;
3446                 struct mem_cgroup_per_node *mz;
3447                 struct zone_reclaim_stat *rstat;
3448                 unsigned long recent_rotated[2] = {0, 0};
3449                 unsigned long recent_scanned[2] = {0, 0};
3450 
3451                 for_each_online_pgdat(pgdat) {
3452                         mz = mem_cgroup_nodeinfo(memcg, pgdat->node_id);
3453                         rstat = &mz->lruvec.reclaim_stat;
3454 
3455                         recent_rotated[0] += rstat->recent_rotated[0];
3456                         recent_rotated[1] += rstat->recent_rotated[1];
3457                         recent_scanned[0] += rstat->recent_scanned[0];
3458                         recent_scanned[1] += rstat->recent_scanned[1];
3459                 }
3460                 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]);
3461                 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]);
3462                 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]);
3463                 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]);
3464         }
3465 #endif
3466 
3467         return 0;
3468 }
3469 
3470 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css,
3471                                       struct cftype *cft)
3472 {
3473         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3474 
3475         return mem_cgroup_swappiness(memcg);
3476 }
3477 
3478 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css,
3479                                        struct cftype *cft, u64 val)
3480 {
3481         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3482 
3483         if (val > 100)
3484                 return -EINVAL;
3485 
3486         if (css->parent)
3487                 memcg->swappiness = val;
3488         else
3489                 vm_swappiness = val;
3490 
3491         return 0;
3492 }
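
/*
 * Note: a write to the root cgroup's memory.swappiness (css->parent ==
 * NULL above) adjusts the global vm_swappiness, while every other cgroup
 * keeps its own private value; in both cases the accepted range is 0..100.
 */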
3493 
3494 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap)
3495 {
3496         struct mem_cgroup_threshold_ary *t;
3497         unsigned long usage;
3498         int i;
3499 
3500         rcu_read_lock();
3501         if (!swap)
3502                 t = rcu_dereference(memcg->thresholds.primary);
3503         else
3504                 t = rcu_dereference(memcg->memsw_thresholds.primary);
3505 
3506         if (!t)
3507                 goto unlock;
3508 
3509         usage = mem_cgroup_usage(memcg, swap);
3510 
3511         /*
3512          * current_threshold points to the threshold just below or equal to
3513          * usage. If that is no longer true, a threshold was crossed after
3514          * the last call of __mem_cgroup_threshold().
3515          */
3516         i = t->current_threshold;
3517 
3518         /*
3519          * Iterate backward over the array of thresholds starting from
3520          * current_threshold and check if a threshold is crossed.
3521          * If none of the thresholds below usage is crossed, we read
3522          * only one element of the array here.
3523          */
3524         for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--)
3525                 eventfd_signal(t->entries[i].eventfd, 1);
3526 
3527         /* i = current_threshold + 1 */
3528         i++;
3529 
3530         /*
3531          * Iterate forward over the array of thresholds starting from
3532          * current_threshold+1 and check if a threshold is crossed.
3533          * If none of the thresholds above usage is crossed, we read
3534          * only one element of the array here.
3535          */
3536         for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++)
3537                 eventfd_signal(t->entries[i].eventfd, 1);
3538 
3539         /* Update current_threshold */
3540         t->current_threshold = i - 1;
3541 unlock:
3542         rcu_read_unlock();
3543 }
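
/*
 * Worked example for the scan above: with thresholds registered at 4M, 8M
 * and 16M (kept sorted ascending) and current_threshold pointing at the
 * 8M entry, a drop in usage to 3M makes the backward loop signal the 8M
 * and 4M eventfds and leaves current_threshold at -1; a later rise to 20M
 * makes the forward loop signal all three eventfds and leaves
 * current_threshold at the 16M entry.  Per call, each eventfd whose
 * threshold lies between the old and the new usage is signalled once.
 */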
3544 
3545 static void mem_cgroup_threshold(struct mem_cgroup *memcg)
3546 {
3547         while (memcg) {
3548                 __mem_cgroup_threshold(memcg, false);
3549                 if (do_memsw_account())
3550                         __mem_cgroup_threshold(memcg, true);
3551 
3552                 memcg = parent_mem_cgroup(memcg);
3553         }
3554 }
3555 
3556 static int compare_thresholds(const void *a, const void *b)
3557 {
3558         const struct mem_cgroup_threshold *_a = a;
3559         const struct mem_cgroup_threshold *_b = b;
3560 
3561         if (_a->threshold > _b->threshold)
3562                 return 1;
3563 
3564         if (_a->threshold < _b->threshold)
3565                 return -1;
3566 
3567         return 0;
3568 }
3569 
3570 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg)
3571 {
3572         struct mem_cgroup_eventfd_list *ev;
3573 
3574         spin_lock(&memcg_oom_lock);
3575 
3576         list_for_each_entry(ev, &memcg->oom_notify, list)
3577                 eventfd_signal(ev->eventfd, 1);
3578 
3579         spin_unlock(&memcg_oom_lock);
3580         return 0;
3581 }
3582 
3583 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg)
3584 {
3585         struct mem_cgroup *iter;
3586 
3587         for_each_mem_cgroup_tree(iter, memcg)
3588                 mem_cgroup_oom_notify_cb(iter);
3589 }
3590 
3591 static int __mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3592         struct eventfd_ctx *eventfd, const char *args, enum res_type type)
3593 {
3594         struct mem_cgroup_thresholds *thresholds;
3595         struct mem_cgroup_threshold_ary *new;
3596         unsigned long threshold;
3597         unsigned long usage;
3598         int i, size, ret;
3599 
3600         ret = page_counter_memparse(args, "-1", &threshold);
3601         if (ret)
3602                 return ret;
3603 
3604         mutex_lock(&memcg->thresholds_lock);
3605 
3606         if (type == _MEM) {
3607                 thresholds = &memcg->thresholds;
3608                 usage = mem_cgroup_usage(memcg, false);
3609         } else if (type == _MEMSWAP) {
3610                 thresholds = &memcg->memsw_thresholds;
3611                 usage = mem_cgroup_usage(memcg, true);
3612         } else
3613                 BUG();
3614 
3615         /* Check if a threshold crossed before adding a new one */
3616         if (thresholds->primary)
3617                 __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3618 
3619         size = thresholds->primary ? thresholds->primary->size + 1 : 1;
3620 
3621         /* Allocate memory for new array of thresholds */
3622         new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold),
3623                         GFP_KERNEL);
3624         if (!new) {
3625                 ret = -ENOMEM;
3626                 goto unlock;
3627         }
3628         new->size = size;
3629 
3630         /* Copy thresholds (if any) to new array */
3631         if (thresholds->primary) {
3632                 memcpy(new->entries, thresholds->primary->entries, (size - 1) *
3633                                 sizeof(struct mem_cgroup_threshold));
3634         }
3635 
3636         /* Add new threshold */
3637         new->entries[size - 1].eventfd = eventfd;
3638         new->entries[size - 1].threshold = threshold;
3639 
3640         /* Sort thresholds. Registering a new threshold isn't time-critical */
3641         sort(new->entries, size, sizeof(struct mem_cgroup_threshold),
3642                         compare_thresholds, NULL);
3643 
3644         /* Find current threshold */
3645         new->current_threshold = -1;
3646         for (i = 0; i < size; i++) {
3647                 if (new->entries[i].threshold <= usage) {
3648                         /*
3649                          * new->current_threshold will not be used until
3650                          * rcu_assign_pointer(), so it's safe to increment
3651                          * it here.
3652                          */
3653                         ++new->current_threshold;
3654                 } else
3655                         break;
3656         }
3657 
3658         /* Free old spare buffer and save old primary buffer as spare */
3659         kfree(thresholds->spare);
3660         thresholds->spare = thresholds->primary;
3661 
3662         rcu_assign_pointer(thresholds->primary, new);
3663 
3664         /* To be sure that nobody uses thresholds */
3665         synchronize_rcu();
3666 
3667 unlock:
3668         mutex_unlock(&memcg->thresholds_lock);
3669 
3670         return ret;
3671 }
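
/*
 * Registration above replaces the primary array wholesale: a copy one
 * entry larger is allocated, the new threshold is added, the whole array
 * is re-sorted and then published with rcu_assign_pointer().  The
 * displaced primary array is kept in ->spare so that the matching
 * unregistration below never needs to allocate memory.
 */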
3672 
3673 static int mem_cgroup_usage_register_event(struct mem_cgroup *memcg,
3674         struct eventfd_ctx *eventfd, const char *args)
3675 {
3676         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEM);
3677 }
3678 
3679 static int memsw_cgroup_usage_register_event(struct mem_cgroup *memcg,
3680         struct eventfd_ctx *eventfd, const char *args)
3681 {
3682         return __mem_cgroup_usage_register_event(memcg, eventfd, args, _MEMSWAP);
3683 }
3684 
3685 static void __mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3686         struct eventfd_ctx *eventfd, enum res_type type)
3687 {
3688         struct mem_cgroup_thresholds *thresholds;
3689         struct mem_cgroup_threshold_ary *new;
3690         unsigned long usage;
3691         int i, j, size;
3692 
3693         mutex_lock(&memcg->thresholds_lock);
3694 
3695         if (type == _MEM) {
3696                 thresholds = &memcg->thresholds;
3697                 usage = mem_cgroup_usage(memcg, false);
3698         } else if (type == _MEMSWAP) {
3699                 thresholds = &memcg->memsw_thresholds;
3700                 usage = mem_cgroup_usage(memcg, true);
3701         } else
3702                 BUG();
3703 
3704         if (!thresholds->primary)
3705                 goto unlock;
3706 
3707         /* Check if a threshold crossed before removing */
3708         __mem_cgroup_threshold(memcg, type == _MEMSWAP);
3709 
3710         /* Calculate the new number of thresholds */
3711         size = 0;
3712         for (i = 0; i < thresholds->primary->size; i++) {
3713                 if (thresholds->primary->entries[i].eventfd != eventfd)
3714                         size++;
3715         }
3716 
3717         new = thresholds->spare;
3718 
3719         /* Set thresholds array to NULL if we don't have thresholds */
3720         if (!size) {
3721                 kfree(new);
3722                 new = NULL;
3723                 goto swap_buffers;
3724         }
3725 
3726         new->size = size;
3727 
3728         /* Copy thresholds and find current threshold */
3729         new->current_threshold = -1;
3730         for (i = 0, j = 0; i < thresholds->primary->size; i++) {
3731                 if (thresholds->primary->entries[i].eventfd == eventfd)
3732                         continue;
3733 
3734                 new->entries[j] = thresholds->primary->entries[i];
3735                 if (new->entries[j].threshold <= usage) {
3736                         /*
3737                          * new->current_threshold will not be used
3738                          * until rcu_assign_pointer(), so it's safe to increment
3739                          * it here.
3740                          */
3741                         ++new->current_threshold;
3742                 }
3743                 j++;
3744         }
3745 
3746 swap_buffers:
3747         /* Swap primary and spare array */
3748         thresholds->spare = thresholds->primary;
3749 
3750         rcu_assign_pointer(thresholds->primary, new);
3751 
3752         /* To be sure that nobody uses thresholds */
3753         synchronize_rcu();
3754 
3755         /* If all events are unregistered, free the spare array */
3756         if (!new) {
3757                 kfree(thresholds->spare);
3758                 thresholds->spare = NULL;
3759         }
3760 unlock:
3761         mutex_unlock(&memcg->thresholds_lock);
3762 }
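
/*
 * Unregistration reuses the spare buffer saved by the registration path,
 * so it cannot fail with -ENOMEM; once the last threshold is removed,
 * both the old primary and the spare buffer are freed.
 */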
3763 
3764 static void mem_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3765         struct eventfd_ctx *eventfd)
3766 {
3767         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEM);
3768 }
3769 
3770 static void memsw_cgroup_usage_unregister_event(struct mem_cgroup *memcg,
3771         struct eventfd_ctx *eventfd)
3772 {
3773         return __mem_cgroup_usage_unregister_event(memcg, eventfd, _MEMSWAP);
3774 }
3775 
3776 static int mem_cgroup_oom_register_event(struct mem_cgroup *memcg,
3777         struct eventfd_ctx *eventfd, const char *args)
3778 {
3779         struct mem_cgroup_eventfd_list *event;
3780 
3781         event = kmalloc(sizeof(*event), GFP_KERNEL);
3782         if (!event)
3783                 return -ENOMEM;
3784 
3785         spin_lock(&memcg_oom_lock);
3786 
3787         event->eventfd = eventfd;
3788         list_add(&event->list, &memcg->oom_notify);
3789 
3790         /* already in OOM ? */
3791         if (memcg->under_oom)
3792                 eventfd_signal(eventfd, 1);
3793         spin_unlock(&memcg_oom_lock);
3794 
3795         return 0;
3796 }
3797 
3798 static void mem_cgroup_oom_unregister_event(struct mem_cgroup *memcg,
3799         struct eventfd_ctx *eventfd)
3800 {
3801         struct mem_cgroup_eventfd_list *ev, *tmp;
3802 
3803         spin_lock(&memcg_oom_lock);
3804 
3805         list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
3806                 if (ev->eventfd == eventfd) {
3807                         list_del(&ev->list);
3808                         kfree(ev);
3809                 }
3810         }
3811 
3812         spin_unlock(&memcg_oom_lock);
3813 }
3814 
3815 static int mem_cgroup_oom_control_read(struct seq_file *sf, void *v)
3816 {
3817         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(sf));
3818 
3819         seq_printf(sf, "oom_kill_disable %d\n", memcg->oom_kill_disable);
3820         seq_printf(sf, "under_oom %d\n", (bool)memcg->under_oom);
3821         seq_printf(sf, "oom_kill %lu\n",
3822                    atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
3823         return 0;
3824 }
3825 
3826 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
3827         struct cftype *cft, u64 val)
3828 {
3829         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
3830 
3831         /* cannot be set on the root cgroup, and only 0 and 1 are allowed */
3832         if (!css->parent || !((val == 0) || (val == 1)))
3833                 return -EINVAL;
3834 
3835         memcg->oom_kill_disable = val;
3836         if (!val)
3837                 memcg_oom_recover(memcg);
3838 
3839         return 0;
3840 }
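
/*
 * For reference, memory.oom_control reads back as three lines,
 * "oom_kill_disable <0|1>", "under_oom <0|1>" and "oom_kill <count>", and
 * accepts only 0 or 1 on write (never on the root cgroup); clearing the
 * flag triggers memcg_oom_recover().
 */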
3841 
3842 #ifdef CONFIG_CGROUP_WRITEBACK
3843 
3844 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3845 {
3846         return wb_domain_init(&memcg->cgwb_domain, gfp);
3847 }
3848 
3849 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3850 {
3851         wb_domain_exit(&memcg->cgwb_domain);
3852 }
3853 
3854 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3855 {
3856         wb_domain_size_changed(&memcg->cgwb_domain);
3857 }
3858 
3859 struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb)
3860 {
3861         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3862 
3863         if (!memcg->css.parent)
3864                 return NULL;
3865 
3866         return &memcg->cgwb_domain;
3867 }
3868 
3869 /**
3870  * mem_cgroup_wb_stats - retrieve writeback related stats from its memcg
3871  * @wb: bdi_writeback in question
3872  * @pfilepages: out parameter for number of file pages
3873  * @pheadroom: out parameter for number of allocatable pages according to memcg
3874  * @pdirty: out parameter for number of dirty pages
3875  * @pwriteback: out parameter for number of pages under writeback
3876  *
3877  * Determine the numbers of file, headroom, dirty, and writeback pages in
3878  * @wb's memcg.  File, dirty and writeback are self-explanatory.  Headroom
3879  * is a bit more involved.
3880  *
3881  * A memcg's headroom is "min(max, high) - used".  In the hierarchy, the
3882  * headroom is calculated as the lowest headroom of itself and the
3883  * ancestors.  Note that this doesn't consider the actual amount of
3884  * available memory in the system.  The caller should further cap
3885  * *@pheadroom accordingly.
3886  */
3887 void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages,
3888                          unsigned long *pheadroom, unsigned long *pdirty,
3889                          unsigned long *pwriteback)
3890 {
3891         struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
3892         struct mem_cgroup *parent;
3893 
3894         *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
3895 
3896         /* this should eventually include NR_UNSTABLE_NFS */
3897         *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
3898         *pfilepages = mem_cgroup_nr_lru_pages(memcg, (1 << LRU_INACTIVE_FILE) |
3899                                                      (1 << LRU_ACTIVE_FILE));
3900         *pheadroom = PAGE_COUNTER_MAX;
3901 
3902         while ((parent = parent_mem_cgroup(memcg))) {
3903                 unsigned long ceiling = min(memcg->memory.max, memcg->high);
3904                 unsigned long used = page_counter_read(&memcg->memory);
3905 
3906                 *pheadroom = min(*pheadroom, ceiling - min(ceiling, used));
3907                 memcg = parent;
3908         }
3909 }
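
/*
 * Worked example for the headroom loop above (tracked in pages, shown in
 * bytes for readability): a memcg with max = 1G, high = 512M and 300M in
 * use contributes ceiling = 512M and headroom = 212M; a non-root ancestor
 * with max = 2G, high unset and 1.5G in use contributes 512M.  *pheadroom
 * ends up as the minimum of the two, 212M.
 */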
3910 
3911 #else   /* CONFIG_CGROUP_WRITEBACK */
3912 
3913 static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp)
3914 {
3915         return 0;
3916 }
3917 
3918 static void memcg_wb_domain_exit(struct mem_cgroup *memcg)
3919 {
3920 }
3921 
3922 static void memcg_wb_domain_size_changed(struct mem_cgroup *memcg)
3923 {
3924 }
3925 
3926 #endif  /* CONFIG_CGROUP_WRITEBACK */
3927 
3928 /*
3929  * DO NOT USE IN NEW FILES.
3930  *
3931  * "cgroup.event_control" implementation.
3932  *
3933  * This is way over-engineered.  It tries to support fully configurable
3934  * events for each user.  Such level of flexibility is completely
3935  * unnecessary especially in the light of the planned unified hierarchy.
3936  *
3937  * Please deprecate this and replace with something simpler if at all
3938  * possible.
3939  */
3940 
3941 /*
3942  * Unregister event and free resources.
3943  *
3944  * Gets called from workqueue.
3945  */
3946 static void memcg_event_remove(struct work_struct *work)
3947 {
3948         struct mem_cgroup_event *event =
3949                 container_of(work, struct mem_cgroup_event, remove);
3950         struct mem_cgroup *memcg = event->memcg;
3951 
3952         remove_wait_queue(event->wqh, &event->wait);
3953 
3954         event->unregister_event(memcg, event->eventfd);
3955 
3956         /* Notify userspace the event is going away. */
3957         eventfd_signal(event->eventfd, 1);
3958 
3959         eventfd_ctx_put(event->eventfd);
3960         kfree(event);
3961         css_put(&memcg->css);
3962 }
3963 
3964 /*
3965  * Gets called on EPOLLHUP on eventfd when user closes it.
3966  *
3967  * Called with wqh->lock held and interrupts disabled.
3968  */
3969 static int memcg_event_wake(wait_queue_entry_t *wait, unsigned mode,
3970                             int sync, void *key)
3971 {
3972         struct mem_cgroup_event *event =
3973                 container_of(wait, struct mem_cgroup_event, wait);
3974         struct mem_cgroup *memcg = event->memcg;
3975         __poll_t flags = key_to_poll(key);
3976 
3977         if (flags & EPOLLHUP) {
3978                 /*
3979                  * If the event has been detached at cgroup removal, we
3980                  * can simply return knowing the other side will cleanup
3981                  * for us.
3982                  *
3983                  * We can't race against event freeing since the other
3984                  * side will require wqh->lock via remove_wait_queue(),
3985                  * which we hold.
3986                  */
3987                 spin_lock(&memcg->event_list_lock);
3988                 if (!list_empty(&event->list)) {
3989                         list_del_init(&event->list);
3990                         /*
3991                          * We are in atomic context, but memcg_event_remove()
3992                          * may sleep, so we have to call it from a workqueue.
3993                          */
3994                         schedule_work(&event->remove);
3995                 }
3996                 spin_unlock(&memcg->event_list_lock);
3997         }
3998 
3999         return 0;
4000 }
4001 
4002 static void memcg_event_ptable_queue_proc(struct file *file,
4003                 wait_queue_head_t *wqh, poll_table *pt)
4004 {
4005         struct mem_cgroup_event *event =
4006                 container_of(pt, struct mem_cgroup_event, pt);
4007 
4008         event->wqh = wqh;
4009         add_wait_queue(wqh, &event->wait);
4010 }
4011 
4012 /*
4013  * DO NOT USE IN NEW FILES.
4014  *
4015  * Parse input and register new cgroup event handler.
4016  *
4017  * Input must be in format '<event_fd> <control_fd> <args>'.
4018  * Interpretation of args is defined by control file implementation.
4019  */
4020 static ssize_t memcg_write_event_control(struct kernfs_open_file *of,
4021                                          char *buf, size_t nbytes, loff_t off)
4022 {
4023         struct cgroup_subsys_state *css = of_css(of);
4024         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4025         struct mem_cgroup_event *event;
4026         struct cgroup_subsys_state *cfile_css;
4027         unsigned int efd, cfd;
4028         struct fd efile;
4029         struct fd cfile;
4030         const char *name;
4031         char *endp;
4032         int ret;
4033 
4034         buf = strstrip(buf);
4035 
4036         efd = simple_strtoul(buf, &endp, 10);
4037         if (*endp != ' ')
4038                 return -EINVAL;
4039         buf = endp + 1;
4040 
4041         cfd = simple_strtoul(buf, &endp, 10);
4042         if ((*endp != ' ') && (*endp != '\0'))
4043                 return -EINVAL;
4044         buf = endp + 1;
4045 
4046         event = kzalloc(sizeof(*event), GFP_KERNEL);
4047         if (!event)
4048                 return -ENOMEM;
4049 
4050         event->memcg = memcg;
4051         INIT_LIST_HEAD(&event->list);
4052         init_poll_funcptr(&event->pt, memcg_event_ptable_queue_proc);
4053         init_waitqueue_func_entry(&event->wait, memcg_event_wake);
4054         INIT_WORK(&event->remove, memcg_event_remove);
4055 
4056         efile = fdget(efd);
4057         if (!efile.file) {
4058                 ret = -EBADF;
4059                 goto out_kfree;
4060         }
4061 
4062         event->eventfd = eventfd_ctx_fileget(efile.file);
4063         if (IS_ERR(event->eventfd)) {
4064                 ret = PTR_ERR(event->eventfd);
4065                 goto out_put_efile;
4066         }
4067 
4068         cfile = fdget(cfd);
4069         if (!cfile.file) {
4070                 ret = -EBADF;
4071                 goto out_put_eventfd;
4072         }
4073 
4074         /* the process needs read permission on the control file */
4075         /* AV: shouldn't we check that it's been opened for read instead? */
4076         ret = inode_permission(file_inode(cfile.file), MAY_READ);
4077         if (ret < 0)
4078                 goto out_put_cfile;
4079 
4080         /*
4081          * Determine the event callbacks and set them in @event.  This used
4082          * to be done via struct cftype but cgroup core no longer knows
4083          * about these events.  The following is crude but the whole thing
4084          * is for compatibility anyway.
4085          *
4086          * DO NOT ADD NEW FILES.
4087          */
4088         name = cfile.file->f_path.dentry->d_name.name;
4089 
4090         if (!strcmp(name, "memory.usage_in_bytes")) {
4091                 event->register_event = mem_cgroup_usage_register_event;
4092                 event->unregister_event = mem_cgroup_usage_unregister_event;
4093         } else if (!strcmp(name, "memory.oom_control")) {
4094                 event->register_event = mem_cgroup_oom_register_event;
4095                 event->unregister_event = mem_cgroup_oom_unregister_event;
4096         } else if (!strcmp(name, "memory.pressure_level")) {
4097                 event->register_event = vmpressure_register_event;
4098                 event->unregister_event = vmpressure_unregister_event;
4099         } else if (!strcmp(name, "memory.memsw.usage_in_bytes")) {
4100                 event->register_event = memsw_cgroup_usage_register_event;
4101                 event->unregister_event = memsw_cgroup_usage_unregister_event;
4102         } else {
4103                 ret = -EINVAL;
4104                 goto out_put_cfile;
4105         }
4106 
4107         /*
4108          * Verify that @cfile belongs to @css.  Also, remaining events are
4109          * automatically removed on cgroup destruction but the removal is
4110          * asynchronous, so take an extra ref on @css.
4111          */
4112         cfile_css = css_tryget_online_from_dir(cfile.file->f_path.dentry->d_parent,
4113                                                &memory_cgrp_subsys);
4114         ret = -EINVAL;
4115         if (IS_ERR(cfile_css))
4116                 goto out_put_cfile;
4117         if (cfile_css != css) {
4118                 css_put(cfile_css);
4119                 goto out_put_cfile;
4120         }
4121 
4122         ret = event->register_event(memcg, event->eventfd, buf);
4123         if (ret)
4124                 goto out_put_css;
4125 
4126         vfs_poll(efile.file, &event->pt);
4127 
4128         spin_lock(&memcg->event_list_lock);
4129         list_add(&event->list, &memcg->event_list);
4130         spin_unlock(&memcg->event_list_lock);
4131 
4132         fdput(cfile);
4133         fdput(efile);
4134 
4135         return nbytes;
4136 
4137 out_put_css:
4138         css_put(css);
4139 out_put_cfile:
4140         fdput(cfile);
4141 out_put_eventfd:
4142         eventfd_ctx_put(event->eventfd);
4143 out_put_efile:
4144         fdput(efile);
4145 out_kfree:
4146         kfree(event);
4147 
4148         return ret;
4149 }
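
/*
 * Illustrative userspace sketch (not from this file) of the registration
 * format parsed above, "<event_fd> <control_fd> <args>": it arms a usage
 * threshold through memory.usage_in_bytes and waits for one notification.
 * The mount point, group name and 64M threshold are assumptions made for
 * the example.
 */
#if 0	/* example only */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/eventfd.h>

int main(void)
{
        const char *base = "/sys/fs/cgroup/memory/example/";
        char path[256], cmd[64];
        uint64_t ticks;
        int efd, ufd, cfd;

        efd = eventfd(0, 0);
        snprintf(path, sizeof(path), "%smemory.usage_in_bytes", base);
        ufd = open(path, O_RDONLY);
        snprintf(path, sizeof(path), "%scgroup.event_control", base);
        cfd = open(path, O_WRONLY);
        if (efd < 0 || ufd < 0 || cfd < 0)
                return 1;

        /* "<event_fd> <control_fd> <args>"; args is the threshold here */
        snprintf(cmd, sizeof(cmd), "%d %d 64M", efd, ufd);
        if (write(cfd, cmd, strlen(cmd)) < 0)
                return 1;

        if (read(efd, &ticks, sizeof(ticks)) == sizeof(ticks))
                printf("threshold crossed %llu time(s)\n",
                       (unsigned long long)ticks);
        return 0;
}
#endif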
4150 
4151 static struct cftype mem_cgroup_legacy_files[] = {
4152         {
4153                 .name = "usage_in_bytes",
4154                 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE),
4155                 .read_u64 = mem_cgroup_read_u64,
4156         },
4157         {
4158                 .name = "max_usage_in_bytes",
4159                 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE),
4160                 .write = mem_cgroup_reset,
4161                 .read_u64 = mem_cgroup_read_u64,
4162         },
4163         {
4164                 .name = "limit_in_bytes",
4165                 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT),
4166                 .write = mem_cgroup_write,
4167                 .read_u64 = mem_cgroup_read_u64,
4168         },
4169         {
4170                 .name = "soft_limit_in_bytes",
4171                 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT),
4172                 .write = mem_cgroup_write,
4173                 .read_u64 = mem_cgroup_read_u64,
4174         },
4175         {
4176                 .name = "failcnt",
4177                 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT),
4178                 .write = mem_cgroup_reset,
4179                 .read_u64 = mem_cgroup_read_u64,
4180         },
4181         {
4182                 .name = "stat",
4183                 .seq_show = memcg_stat_show,
4184         },
4185         {
4186                 .name = "force_empty",
4187                 .write = mem_cgroup_force_empty_write,
4188         },
4189         {
4190                 .name = "use_hierarchy",
4191                 .write_u64 = mem_cgroup_hierarchy_write,
4192                 .read_u64 = mem_cgroup_hierarchy_read,
4193         },
4194         {
4195                 .name = "cgroup.event_control",         /* XXX: for compat */
4196                 .write = memcg_write_event_control,
4197                 .flags = CFTYPE_NO_PREFIX | CFTYPE_WORLD_WRITABLE,
4198         },
4199         {
4200                 .name = "swappiness",
4201                 .read_u64 = mem_cgroup_swappiness_read,
4202                 .write_u64 = mem_cgroup_swappiness_write,
4203         },
4204         {
4205                 .name = "move_charge_at_immigrate",
4206                 .read_u64 = mem_cgroup_move_charge_read,
4207                 .write_u64 = mem_cgroup_move_charge_write,
4208         },
4209         {
4210                 .name = "oom_control",
4211                 .seq_show = mem_cgroup_oom_control_read,
4212                 .write_u64 = mem_cgroup_oom_control_write,
4213                 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL),
4214         },
4215         {
4216                 .name = "pressure_level",
4217         },
4218 #ifdef CONFIG_NUMA
4219         {
4220                 .name = "numa_stat",
4221                 .seq_show = memcg_numa_stat_show,
4222         },
4223 #endif
4224         {
4225                 .name = "kmem.limit_in_bytes",
4226                 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT),
4227                 .write = mem_cgroup_write,
4228                 .read_u64 = mem_cgroup_read_u64,
4229         },
4230         {
4231                 .name = "kmem.usage_in_bytes",
4232                 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE),
4233                 .read_u64 = mem_cgroup_read_u64,
4234         },
4235         {
4236                 .name = "kmem.failcnt",
4237                 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT),
4238                 .write = mem_cgroup_reset,
4239                 .read_u64 = mem_cgroup_read_u64,
4240         },
4241         {
4242                 .name = "kmem.max_usage_in_bytes",
4243                 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE),
4244                 .write = mem_cgroup_reset,
4245                 .read_u64 = mem_cgroup_read_u64,
4246         },
4247 #if defined(CONFIG_SLAB) || defined(CONFIG_SLUB_DEBUG)
4248         {
4249                 .name = "kmem.slabinfo",
4250                 .seq_start = memcg_slab_start,
4251                 .seq_next = memcg_slab_next,
4252                 .seq_stop = memcg_slab_stop,
4253                 .seq_show = memcg_slab_show,
4254         },
4255 #endif
4256         {
4257                 .name = "kmem.tcp.limit_in_bytes",
4258                 .private = MEMFILE_PRIVATE(_TCP, RES_LIMIT),
4259                 .write = mem_cgroup_write,
4260                 .read_u64 = mem_cgroup_read_u64,
4261         },
4262         {
4263                 .name = "kmem.tcp.usage_in_bytes",
4264                 .private = MEMFILE_PRIVATE(_TCP, RES_USAGE),
4265                 .read_u64 = mem_cgroup_read_u64,
4266         },
4267         {
4268                 .name = "kmem.tcp.failcnt",
4269                 .private = MEMFILE_PRIVATE(_TCP, RES_FAILCNT),
4270                 .write = mem_cgroup_reset,
4271                 .read_u64 = mem_cgroup_read_u64,
4272         },
4273         {
4274                 .name = "kmem.tcp.max_usage_in_bytes",
4275                 .private = MEMFILE_PRIVATE(_TCP, RES_MAX_USAGE),
4276                 .write = mem_cgroup_reset,
4277                 .read_u64 = mem_cgroup_read_u64,
4278         },
4279         { },    /* terminate */
4280 };
4281 
4282 /*
4283  * Private memory cgroup IDR
4284  *
4285  * Swap-out records and page cache shadow entries need to store memcg
4286  * references in constrained space, so we maintain an ID space that is
4287  * limited to 16 bit (MEM_CGROUP_ID_MAX), limiting the total number of
4288  * memory-controlled cgroups to 64k.
4289  *
4290  * However, there usually are many references to the offline CSS after
4291  * the cgroup has been destroyed, such as page cache or reclaimable
4292  * slab objects, that don't need to hang on to the ID. We want to keep
4293  * those dead CSS from occupying IDs, or we might quickly exhaust the
4294  * relatively small ID space and prevent the creation of new cgroups
4295  * even when there are much fewer than 64k cgroups - possibly none.
4296  *
4297  * Maintain a private 16-bit ID space for memcg, and allow the ID to
4298  * be freed and recycled when it's no longer needed, which is usually
4299  * when the CSS is offlined.
4300  *
4301  * The only exception to that are records of swapped out tmpfs/shmem
4302  * pages that need to be attributed to live ancestors on swapin. But
4303  * those references are manageable from userspace.
4304  */
4305 
4306 static DEFINE_IDR(mem_cgroup_idr);
4307 
4308 static void mem_cgroup_id_remove(struct mem_cgroup *memcg)
4309 {
4310         if (memcg->id.id > 0) {
4311                 idr_remove(&mem_cgroup_idr, memcg->id.id);
4312                 memcg->id.id = 0;
4313         }
4314 }
4315 
4316 static void mem_cgroup_id_get_many(struct mem_cgroup *memcg, unsigned int n)
4317 {
4318         refcount_add(n, &memcg->id.ref);
4319 }
4320 
4321 static void mem_cgroup_id_put_many(struct mem_cgroup *memcg, unsigned int n)
4322 {
4323         if (refcount_sub_and_test(n, &memcg->id.ref)) {
4324                 mem_cgroup_id_remove(memcg);
4325 
4326                 /* Memcg ID pins CSS */
4327                 css_put(&memcg->css);
4328         }
4329 }
4330 
4331 static inline void mem_cgroup_id_get(struct mem_cgroup *memcg)
4332 {
4333         mem_cgroup_id_get_many(memcg, 1);
4334 }
4335 
4336 static inline void mem_cgroup_id_put(struct mem_cgroup *memcg)
4337 {
4338         mem_cgroup_id_put_many(memcg, 1);
4339 }
4340 
4341 /**
4342  * mem_cgroup_from_id - look up a memcg from a memcg id
4343  * @id: the memcg id to look up
4344  *
4345  * Caller must hold rcu_read_lock().
4346  */
4347 struct mem_cgroup *mem_cgroup_from_id(unsigned short id)
4348 {
4349         WARN_ON_ONCE(!rcu_read_lock_held());
4350         return idr_find(&mem_cgroup_idr, id);
4351 }
4352 
4353 static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4354 {
4355         struct mem_cgroup_per_node *pn;
4356         int tmp = node;
4357         /*
4358          * This routine is called for each possible node.
4359          * But it's a BUG to call kmalloc() against an offline node.
4360          *
4361          * TODO: this routine can waste much memory for nodes which will
4362          *       never be onlined. It's better to use memory hotplug callback
4363          *       function.
4364          */
4365         if (!node_state(node, N_NORMAL_MEMORY))
4366                 tmp = -1;
4367         pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp);
4368         if (!pn)
4369                 return 1;
4370 
4371         pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat);
4372         if (!pn->lruvec_stat_cpu) {
4373                 kfree(pn);
4374                 return 1;
4375         }
4376 
4377         lruvec_init(&pn->lruvec);
4378         pn->usage_in_excess = 0;
4379         pn->on_tree = false;
4380         pn->memcg = memcg;
4381 
4382         memcg->nodeinfo[node] = pn;
4383         return 0;
4384 }
4385 
4386 static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node)
4387 {
4388         struct mem_cgroup_per_node *pn = memcg->nodeinfo[node];
4389 
4390         if (!pn)
4391                 return;
4392 
4393         free_percpu(pn->lruvec_stat_cpu);
4394         kfree(pn);
4395 }
4396 
4397 static void __mem_cgroup_free(struct mem_cgroup *memcg)
4398 {
4399         int node;
4400 
4401         for_each_node(node)
4402                 free_mem_cgroup_per_node_info(memcg, node);
4403         free_percpu(memcg->stat_cpu);
4404         kfree(memcg);
4405 }
4406 
4407 static void mem_cgroup_free(struct mem_cgroup *memcg)
4408 {
4409         memcg_wb_domain_exit(memcg);
4410         __mem_cgroup_free(memcg);
4411 }
4412 
4413 static struct mem_cgroup *mem_cgroup_alloc(void)
4414 {
4415         struct mem_cgroup *memcg;
4416         size_t size;
4417         int node;
4418 
4419         size = sizeof(struct mem_cgroup);
4420         size += nr_node_ids * sizeof(struct mem_cgroup_per_node *);
4421 
4422         memcg = kzalloc(size, GFP_KERNEL);
4423         if (!memcg)
4424                 return NULL;
4425 
4426         memcg->id.id = idr_alloc(&mem_cgroup_idr, NULL,
4427                                  1, MEM_CGROUP_ID_MAX,
4428                                  GFP_KERNEL);
4429         if (memcg->id.id < 0)
4430                 goto fail;
4431 
4432         memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu);
4433         if (!memcg->stat_cpu)
4434                 goto fail;
4435 
4436         for_each_node(node)
4437                 if (alloc_mem_cgroup_per_node_info(memcg, node))
4438                         goto fail;
4439 
4440         if (memcg_wb_domain_init(memcg, GFP_KERNEL))
4441                 goto fail;
4442 
4443         INIT_WORK(&memcg->high_work, high_work_func);
4444         memcg->last_scanned_node = MAX_NUMNODES;
4445         INIT_LIST_HEAD(&memcg->oom_notify);
4446         mutex_init(&memcg->thresholds_lock);
4447         spin_lock_init(&memcg->move_lock);
4448         vmpressure_init(&memcg->vmpressure);
4449         INIT_LIST_HEAD(&memcg->event_list);
4450         spin_lock_init(&memcg->event_list_lock);
4451         memcg->socket_pressure = jiffies;
4452 #ifdef CONFIG_MEMCG_KMEM
4453         memcg->kmemcg_id = -1;
4454 #endif
4455 #ifdef CONFIG_CGROUP_WRITEBACK
4456         INIT_LIST_HEAD(&memcg->cgwb_list);
4457 #endif
4458         idr_replace(&mem_cgroup_idr, memcg, memcg->id.id);
4459         return memcg;
4460 fail:
4461         mem_cgroup_id_remove(memcg);
4462         __mem_cgroup_free(memcg);
4463         return NULL;
4464 }
4465 
4466 static struct cgroup_subsys_state * __ref
4467 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
4468 {
4469         struct mem_cgroup *parent = mem_cgroup_from_css(parent_css);
4470         struct mem_cgroup *memcg;
4471         long error = -ENOMEM;
4472 
4473         memcg = mem_cgroup_alloc();
4474         if (!memcg)
4475                 return ERR_PTR(error);
4476 
4477         memcg->high = PAGE_COUNTER_MAX;
4478         memcg->soft_limit = PAGE_COUNTER_MAX;
4479         if (parent) {
4480                 memcg->swappiness = mem_cgroup_swappiness(parent);
4481                 memcg->oom_kill_disable = parent->oom_kill_disable;
4482         }
4483         if (parent && parent->use_hierarchy) {
4484                 memcg->use_hierarchy = true;
4485                 page_counter_init(&memcg->memory, &parent->memory);
4486                 page_counter_init(&memcg->swap, &parent->swap);
4487                 page_counter_init(&memcg->memsw, &parent->memsw);
4488                 page_counter_init(&memcg->kmem, &parent->kmem);
4489                 page_counter_init(&memcg->tcpmem, &parent->tcpmem);
4490         } else {
4491                 page_counter_init(&memcg->memory, NULL);
4492                 page_counter_init(&memcg->swap, NULL);
4493                 page_counter_init(&memcg->memsw, NULL);
4494                 page_counter_init(&memcg->kmem, NULL);
4495                 page_counter_init(&memcg->tcpmem, NULL);
4496                 /*
4497                  * A deeper hierarchy with use_hierarchy == false doesn't make
4498                  * much sense, so let the cgroup subsystem know about this
4499                  * unfortunate state in our controller.
4500                  */
4501                 if (parent != root_mem_cgroup)
4502                         memory_cgrp_subsys.broken_hierarchy = true;
4503         }
4504 
4505         /* The following stuff does not apply to the root */
4506         if (!parent) {
4507                 root_mem_cgroup = memcg;
4508                 return &memcg->css;
4509         }
4510 
4511         error = memcg_online_kmem(memcg);
4512         if (error)
4513                 goto fail;
4514 
4515         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4516                 static_branch_inc(&memcg_sockets_enabled_key);
4517 
4518         return &memcg->css;
4519 fail:
4520         mem_cgroup_id_remove(memcg);
4521         mem_cgroup_free(memcg);
4522         return ERR_PTR(-ENOMEM);
4523 }
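
/*
 * Note: when use_hierarchy is set, page_counter_init() above links every
 * child counter to its parent's, so a charge against a child is
 * propagated up and checked against the max of each ancestor as well;
 * with the counters initialised against NULL, each group is accounted in
 * isolation.
 */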
4524 
4525 static int mem_cgroup_css_online(struct cgroup_subsys_state *css)
4526 {
4527         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4528 
4529         /*
4530          * A memcg must be visible for memcg_expand_shrinker_maps()
4531          * by the time the maps are allocated. So, we allocate maps
4532          * here, when for_each_mem_cgroup() can't skip it.
4533          */
4534         if (memcg_alloc_shrinker_maps(memcg)) {
4535                 mem_cgroup_id_remove(memcg);
4536                 return -ENOMEM;
4537         }
4538 
4539         /* Online state pins memcg ID, memcg ID pins CSS */
4540         refcount_set(&memcg->id.ref, 1);
4541         css_get(css);
4542         return 0;
4543 }
4544 
4545 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css)
4546 {
4547         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4548         struct mem_cgroup_event *event, *tmp;
4549 
4550         /*
4551          * Unregister events and notify userspace.
4552          * Notify userspace about cgroup removal only after rmdir of the cgroup
4553          * directory to avoid a race between userspace and kernelspace.
4554          */
4555         spin_lock(&memcg->event_list_lock);
4556         list_for_each_entry_safe(event, tmp, &memcg->event_list, list) {
4557                 list_del_init(&event->list);
4558                 schedule_work(&event->remove);
4559         }
4560         spin_unlock(&memcg->event_list_lock);
4561 
4562         page_counter_set_min(&memcg->memory, 0);
4563         page_counter_set_low(&memcg->memory, 0);
4564 
4565         memcg_offline_kmem(memcg);
4566         wb_memcg_offline(memcg);
4567 
4568         drain_all_stock(memcg);
4569 
4570         mem_cgroup_id_put(memcg);
4571 }
4572 
4573 static void mem_cgroup_css_released(struct cgroup_subsys_state *css)
4574 {
4575         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4576 
4577         invalidate_reclaim_iterators(memcg);
4578 }
4579 
4580 static void mem_cgroup_css_free(struct cgroup_subsys_state *css)
4581 {
4582         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4583 
4584         if (cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_nosocket)
4585                 static_branch_dec(&memcg_sockets_enabled_key);
4586 
4587         if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && memcg->tcpmem_active)
4588                 static_branch_dec(&memcg_sockets_enabled_key);
4589 
4590         vmpressure_cleanup(&memcg->vmpressure);
4591         cancel_work_sync(&memcg->high_work);
4592         mem_cgroup_remove_from_trees(memcg);
4593         memcg_free_shrinker_maps(memcg);
4594         memcg_free_kmem(memcg);
4595         mem_cgroup_free(memcg);
4596 }
4597 
4598 /**
4599  * mem_cgroup_css_reset - reset the states of a mem_cgroup
4600  * @css: the target css
4601  *
4602  * Reset the states of the mem_cgroup associated with @css.  This is
4603  * invoked when the userland requests disabling on the default hierarchy
4604  * but the memcg is pinned through dependency.  The memcg should stop
4605  * applying policies and should revert to the vanilla state as it may be
4606  * made visible again.
4607  *
4608  * The current implementation only resets the essential configurations.
4609  * This needs to be expanded to cover all the visible parts.
4610  */
4611 static void mem_cgroup_css_reset(struct cgroup_subsys_state *css)
4612 {
4613         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
4614 
4615         page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX);
4616         page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX);
4617         page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX);
4618         page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX);
4619         page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX);
4620         page_counter_set_min(&memcg->memory, 0);
4621         page_counter_set_low(&memcg->memory, 0);
4622         memcg->high = PAGE_COUNTER_MAX;
4623         memcg->soft_limit = PAGE_COUNTER_MAX;
4624         memcg_wb_domain_size_changed(memcg);
4625 }
4626 
4627 #ifdef CONFIG_MMU
4628 /* Handlers for move charge at task migration. */
4629 static int mem_cgroup_do_precharge(unsigned long count)
4630 {
4631         int ret;
4632 
4633         /* Try a single bulk charge without reclaim first, kswapd may wake */
4634         ret = try_charge(mc.to, GFP_KERNEL & ~__GFP_DIRECT_RECLAIM, count);
4635         if (!ret) {
4636                 mc.precharge += count;
4637                 return ret;
4638         }
4639 
4640         /* Try charges one by one with reclaim, but do not retry */
4641         while (count--) {
4642                 ret = try_charge(mc.to, GFP_KERNEL | __GFP_NORETRY, 1);
4643                 if (ret)
4644                         return ret;
4645                 mc.precharge++;
4646                 cond_resched();
4647         }
4648         return 0;
4649 }
4650 
4651 union mc_target {
4652         struct page     *page;
4653         swp_entry_t     ent;
4654 };
4655 
4656 enum mc_target_type {
4657         MC_TARGET_NONE = 0,
4658         MC_TARGET_PAGE,
4659         MC_TARGET_SWAP,
4660         MC_TARGET_DEVICE,
4661 };
4662 
4663 static struct page *mc_handle_present_pte(struct vm_area_struct *vma,
4664                                                 unsigned long addr, pte_t ptent)
4665 {
4666         struct page *page = _vm_normal_page(vma, addr, ptent, true);
4667 
4668         if (!page || !page_mapped(page))
4669                 return NULL;
4670         if (PageAnon(page)) {
4671                 if (!(mc.flags & MOVE_ANON))
4672                         return NULL;
4673         } else {
4674                 if (!(mc.flags & MOVE_FILE))
4675                         return NULL;
4676         }
4677         if (!get_page_unless_zero(page))
4678                 return NULL;
4679 
4680         return page;
4681 }
4682 
4683 #if defined(CONFIG_SWAP) || defined(CONFIG_DEVICE_PRIVATE)
4684 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4685                         pte_t ptent, swp_entry_t *entry)
4686 {
4687         struct page *page = NULL;
4688         swp_entry_t ent = pte_to_swp_entry(ptent);
4689 
4690         if (!(mc.flags & MOVE_ANON) || non_swap_entry(ent))
4691                 return NULL;
4692 
4693         /*
4694          * Handle MEMORY_DEVICE_PRIVATE, which are ZONE_DEVICE pages belonging to
4695          * a device; because they are not accessible by the CPU, they are stored
4696          * as special swap entries in the CPU page table.
4697          */
4698         if (is_device_private_entry(ent)) {
4699                 page = device_private_entry_to_page(ent);
4700                 /*
4701                  * MEMORY_DEVICE_PRIVATE means a ZONE_DEVICE page, which has
4702                  * a refcount of 1 when free (unlike a normal page)
4703                  */
4704                 if (!page_ref_add_unless(page, 1, 1))
4705                         return NULL;
4706                 return page;
4707         }
4708 
4709         /*
4710          * Because lookup_swap_cache() updates some statistics counters,
4711          * we call find_get_page() with swapper_space directly.
4712          */
4713         page = find_get_page(swap_address_space(ent), swp_offset(ent));
4714         if (do_memsw_account())
4715                 entry->val = ent.val;
4716 
4717         return page;
4718 }
4719 #else
4720 static struct page *mc_handle_swap_pte(struct vm_area_struct *vma,
4721                         pte_t ptent, swp_entry_t *entry)
4722 {
4723         return NULL;
4724 }
4725 #endif
4726 
4727 static struct page *mc_handle_file_pte(struct vm_area_struct *vma,
4728                         unsigned long addr, pte_t ptent, swp_entry_t *entry)
4729 {
4730         struct page *page = NULL;
4731         struct address_space *mapping;
4732         pgoff_t pgoff;
4733 
4734         if (!vma->vm_file) /* anonymous vma */
4735                 return NULL;
4736         if (!(mc.flags & MOVE_FILE))
4737                 return NULL;
4738 
4739         mapping = vma->vm_file->f_mapping;
4740         pgoff = linear_page_index(vma, addr);
4741 
4742         /* the page is moved even if it's not RSS of this task (page-faulted). */
4743 #ifdef CONFIG_SWAP
4744         /* shmem/tmpfs may report page out on swap: account for that too. */
4745         if (shmem_mapping(mapping)) {
4746                 page = find_get_entry(mapping, pgoff);
4747                 if (xa_is_value(page)) {
4748                         swp_entry_t swp = radix_to_swp_entry(page);
4749                         if (do_memsw_account())
4750                                 *entry = swp;
4751                         page = find_get_page(swap_address_space(swp),
4752                                              swp_offset(swp));
4753                 }
4754         } else
4755                 page = find_get_page(mapping, pgoff);
4756 #else
4757         page = find_get_page(mapping, pgoff);
4758 #endif
4759         return page;
4760 }
4761 
4762 /**
4763  * mem_cgroup_move_account - move account of the page
4764  * @page: the page
4765  * @compound: charge the page as compound or small page
4766  * @from: mem_cgroup which the page is moved from.
4767  * @to: mem_cgroup which the page is moved to. @from != @to.
4768  *
4769  * The caller must make sure the page is not on the LRU (isolate_lru_page() is useful.)
4770  *
4771  * This function doesn't do a "charge" to the new cgroup and doesn't do an
4772  * "uncharge" from the old cgroup.
4773  */
4774 static int mem_cgroup_move_account(struct page *page,
4775                                    bool compound,
4776                                    struct mem_cgroup *from,
4777                                    struct mem_cgroup *to)
4778 {
4779         unsigned long flags;
4780         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
4781         int ret;
4782         bool anon;
4783 
4784         VM_BUG_ON(from == to);
4785         VM_BUG_ON_PAGE(PageLRU(page), page);
4786         VM_BUG_ON(compound && !PageTransHuge(page));
4787 
4788         /*
4789          * Prevent mem_cgroup_migrate() from looking at
4790          * page->mem_cgroup of its source page while we change it.
4791          */
4792         ret = -EBUSY;
4793         if (!trylock_page(page))
4794                 goto out;
4795 
4796         ret = -EINVAL;
4797         if (page->mem_cgroup != from)
4798                 goto out_unlock;
4799 
4800         anon = PageAnon(page);
4801 
4802         spin_lock_irqsave(&from->move_lock, flags);
4803 
4804         if (!anon && page_mapped(page)) {
4805                 __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages);
4806                 __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages);
4807         }
4808 
4809         /*
4810          * move_lock was grabbed above and the caller set from->moving_account,
4811          * so mod_memcg_page_state() will serialize updates to PageDirty.
4812          * So the mapping should be stable for dirty pages.
4813          */
4814         if (!anon && PageDirty(page)) {
4815                 struct address_space *mapping = page_mapping(page);
4816 
4817                 if (mapping_cap_account_dirty(mapping)) {
4818                         __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages);
4819                         __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages);
4820                 }
4821         }
4822 
4823         if (PageWriteback(page)) {
4824                 __mod_memcg_state(from, NR_WRITEBACK, -nr_pages);
4825                 __mod_memcg_state(to, NR_WRITEBACK, nr_pages);
4826         }
4827 
4828         /*
4829          * It is safe to change page->mem_cgroup here because the page
4830          * is referenced, charged, and isolated - we can't race with
4831          * uncharging, charging, migration, or LRU putback.
4832          */
4833 
4834         /* caller should have done css_get */
4835         page->mem_cgroup = to;
4836         spin_unlock_irqrestore(&from->move_lock, flags);
4837 
4838         ret = 0;
4839 
4840         local_irq_disable();
4841         mem_cgroup_charge_statistics(to, page, compound, nr_pages);
4842         memcg_check_events(to, page);
4843         mem_cgroup_charge_statistics(from, page, compound, -nr_pages);
4844         memcg_check_events(from, page);
4845         local_irq_enable();
4846 out_unlock:
4847         unlock_page(page);
4848 out:
4849         return ret;
4850 }
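/*
 * A condensed, illustrative sketch of how callers are expected to drive
 * mem_cgroup_move_account(): isolate the page from the LRU, move it, then
 * put it back.  The move-charge walk further below follows this pattern;
 * the helper name here is hypothetical and the page reference is assumed
 * to be already held by the caller (e.g. taken by get_mctgt_type()).
 */
static int __maybe_unused example_move_one_page(struct page *page,
                                                struct mem_cgroup *from,
                                                struct mem_cgroup *to)
{
        int ret = -EBUSY;

        /* the page must not be on the LRU while page->mem_cgroup changes */
        if (!isolate_lru_page(page)) {
                ret = mem_cgroup_move_account(page, false, from, to);
                putback_lru_page(page);
        }
        /* the caller drops the page reference it obtained */
        return ret;
}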
4851 
4852 /**
4853  * get_mctgt_type - get target type of moving charge
4854  * @vma: the vma the pte to be checked belongs to
4855  * @addr: the address corresponding to the pte to be checked
4856  * @ptent: the pte to be checked
4857  * @target: the pointer where the target page or swap entry will be stored (can be NULL)
4858  *
4859  * Returns
4860  *   0(MC_TARGET_NONE): if the pte is not a target for move charge.
4861  *   1(MC_TARGET_PAGE): if the page corresponding to this pte is a target for
4862  *     move charge. If @target is not NULL, the page is stored in target->page
4863  *     with an extra refcount taken (callers should handle it).
4864  *   2(MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
4865  *     target for charge migration. If @target is not NULL, the entry is stored
4866  *     in target->ent.
4867  *   3(MC_TARGET_DEVICE): like MC_TARGET_PAGE but the page is MEMORY_DEVICE_PUBLIC
4868  *     or MEMORY_DEVICE_PRIVATE (so a ZONE_DEVICE page and thus not on the LRU).
4869  *     For now such a page is charged like a regular page would be, as for all
4870  *     intents and purposes it is just special memory taking the place of a
4871  *     regular page.
4872  *
4873  *     See Documentation/vm/hmm.txt and include/linux/hmm.h
4874  *
4875  * Called with pte lock held.
4876  */
4877 
4878 static enum mc_target_type get_mctgt_type(struct vm_area_struct *vma,
4879                 unsigned long addr, pte_t ptent, union mc_target *target)
4880 {
4881         struct page *page = NULL;
4882         enum mc_target_type ret = MC_TARGET_NONE;
4883         swp_entry_t ent = { .val = 0 };
4884 
4885         if (pte_present(ptent))
4886                 page = mc_handle_present_pte(vma, addr, ptent);
4887         else if (is_swap_pte(ptent))
4888                 page = mc_handle_swap_pte(vma, ptent, &ent);
4889         else if (pte_none(ptent))
4890                 page = mc_handle_file_pte(vma, addr, ptent, &ent);
4891 
4892         if (!page && !ent.val)
4893                 return ret;
4894         if (page) {
4895                 /*
4896                  * Do only a loose check without serialization.
4897                  * mem_cgroup_move_account() checks whether the page is
4898                  * valid or not, under LRU exclusion.
4899                  */
4900                 if (page->mem_cgroup == mc.from) {
4901                         ret = MC_TARGET_PAGE;
4902                         if (is_device_private_page(page) ||
4903                             is_device_public_page(page))
4904                                 ret = MC_TARGET_DEVICE;
4905                         if (target)
4906                                 target->page = page;
4907                 }
4908                 if (!ret || !target)
4909                         put_page(page);
4910         }
4911         /*
4912          * There is a swap entry and the page doesn't exist or isn't charged.
4913          * But we cannot move a tail page of a THP.
4914          */
4915         if (ent.val && !ret && (!page || !PageTransCompound(page)) &&
4916             mem_cgroup_id(mc.from) == lookup_swap_cgroup_id(ent)) {
4917                 ret = MC_TARGET_SWAP;
4918                 if (target)
4919                         target->ent = ent;
4920         }
4921         return ret;
4922 }
4923 
4924 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
4925 /*
4926  * We don't consider PMD-mapped swapping or file-mapped pages because THP does
4927  * not support them for now.
4928  * The caller should make sure that pmd_trans_huge(pmd) is true.
4929  */
4930 static enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4931                 unsigned long addr, pmd_t pmd, union mc_target *target)
4932 {
4933         struct page *page = NULL;
4934         enum mc_target_type ret = MC_TARGET_NONE;
4935 
4936         if (unlikely(is_swap_pmd(pmd))) {
4937                 VM_BUG_ON(thp_migration_supported() &&
4938                                   !is_pmd_migration_entry(pmd));
4939                 return ret;
4940         }
4941         page = pmd_page(pmd);
4942         VM_BUG_ON_PAGE(!page || !PageHead(page), page);
4943         if (!(mc.flags & MOVE_ANON))
4944                 return ret;
4945         if (page->mem_cgroup == mc.from) {
4946                 ret = MC_TARGET_PAGE;
4947                 if (target) {
4948                         get_page(page);
4949                         target->page = page;
4950                 }
4951         }
4952         return ret;
4953 }
4954 #else
4955 static inline enum mc_target_type get_mctgt_type_thp(struct vm_area_struct *vma,
4956                 unsigned long addr, pmd_t pmd, union mc_target *target)
4957 {
4958         return MC_TARGET_NONE;
4959 }
4960 #endif
4961 
4962 static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd,
4963                                         unsigned long addr, unsigned long end,
4964                                         struct mm_walk *walk)
4965 {
4966         struct vm_area_struct *vma = walk->vma;
4967         pte_t *pte;
4968         spinlock_t *ptl;
4969 
4970         ptl = pmd_trans_huge_lock(pmd, vma);
4971         if (ptl) {
4972                 /*
4973                  * Note there cannot be MC_TARGET_DEVICE for now as we do not
4974                  * support transparent huge pages with MEMORY_DEVICE_PUBLIC or
4975                  * MEMORY_DEVICE_PRIVATE, but this might change.
4976                  */
4977                 if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE)
4978                         mc.precharge += HPAGE_PMD_NR;
4979                 spin_unlock(ptl);
4980                 return 0;
4981         }
4982 
4983         if (pmd_trans_unstable(pmd))
4984                 return 0;
4985         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
4986         for (; addr != end; pte++, addr += PAGE_SIZE)
4987                 if (get_mctgt_type(vma, addr, *pte, NULL))
4988                         mc.precharge++; /* increment precharge temporarily */
4989         pte_unmap_unlock(pte - 1, ptl);
4990         cond_resched();
4991 
4992         return 0;
4993 }
4994 
4995 static unsigned long mem_cgroup_count_precharge(struct mm_struct *mm)
4996 {
4997         unsigned long precharge;
4998 
4999         struct mm_walk mem_cgroup_count_precharge_walk = {
5000                 .pmd_entry = mem_cgroup_count_precharge_pte_range,
5001                 .mm = mm,
5002         };
5003         down_read(&mm->mmap_sem);
5004         walk_page_range(0, mm->highest_vm_end,
5005                         &mem_cgroup_count_precharge_walk);
5006         up_read(&mm->mmap_sem);
5007 
5008         precharge = mc.precharge;
5009         mc.precharge = 0;
5010 
5011         return precharge;
5012 }
5013 
5014 static int mem_cgroup_precharge_mc(struct mm_struct *mm)
5015 {
5016         unsigned long precharge = mem_cgroup_count_precharge(mm);
5017 
5018         VM_BUG_ON(mc.moving_task);
5019         mc.moving_task = current;
5020         return mem_cgroup_do_precharge(precharge);
5021 }
5022 
5023 /* cancels all extra charges on mc.from and mc.to, and wakes up all waiters. */
5024 static void __mem_cgroup_clear_mc(void)
5025 {
5026         struct mem_cgroup *from = mc.from;
5027         struct mem_cgroup *to = mc.to;
5028 
5029         /* we must uncharge all the leftover precharges from mc.to */
5030         if (mc.precharge) {
5031                 cancel_charge(mc.to, mc.precharge);
5032                 mc.precharge = 0;
5033         }
5034         /*
5035          * we didn't uncharge from mc.from at mem_cgroup_move_account(), so
5036          * we must uncharge here.
5037          */
5038         if (mc.moved_charge) {
5039                 cancel_charge(mc.from, mc.moved_charge);
5040                 mc.moved_charge = 0;
5041         }
5042         /* we must fixup refcnts and charges */
5043         if (mc.moved_swap) {
5044                 /* uncharge swap account from the old cgroup */
5045                 if (!mem_cgroup_is_root(mc.from))
5046                         page_counter_uncharge(&mc.from->memsw, mc.moved_swap);
5047 
5048                 mem_cgroup_id_put_many(mc.from, mc.moved_swap);
5049 
5050                 /*
5051                  * we charged both to->memory and to->memsw, so we
5052                  * should uncharge to->memory.
5053                  */
5054                 if (!mem_cgroup_is_root(mc.to))
5055                         page_counter_uncharge(&mc.to->memory, mc.moved_swap);
5056 
5057                 mem_cgroup_id_get_many(mc.to, mc.moved_swap);
5058                 css_put_many(&mc.to->css, mc.moved_swap);
5059 
5060                 mc.moved_swap = 0;
5061         }
5062         memcg_oom_recover(from);
5063         memcg_oom_recover(to);
5064         wake_up_all(&mc.waitq);
5065 }
5066 
5067 static void mem_cgroup_clear_mc(void)
5068 {
5069         struct mm_struct *mm = mc.mm;
5070 
5071         /*
5072          * we must clear moving_task before waking up waiters at the end of
5073          * task migration.
5074          */
5075         mc.moving_task = NULL;
5076         __mem_cgroup_clear_mc();
5077         spin_lock(&mc.lock);
5078         mc.from = NULL;
5079         mc.to = NULL;
5080         mc.mm = NULL;
5081         spin_unlock(&mc.lock);
5082 
5083         mmput(mm);
5084 }
5085 
5086 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5087 {
5088         struct cgroup_subsys_state *css;
5089         struct mem_cgroup *memcg = NULL; /* unneeded init to make gcc happy */
5090         struct mem_cgroup *from;
5091         struct task_struct *leader, *p;
5092         struct mm_struct *mm;
5093         unsigned long move_flags;
5094         int ret = 0;
5095 
5096         /* charge immigration isn't supported on the default hierarchy */
5097         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5098                 return 0;
5099 
5100         /*
5101          * Multi-process migrations only happen on the default hierarchy
5102          * where charge immigration is not used.  Perform charge
5103          * immigration if @tset contains a leader and whine if there are
5104          * multiple.
5105          */
5106         p = NULL;
5107         cgroup_taskset_for_each_leader(leader, css, tset) {
5108                 WARN_ON_ONCE(p);
5109                 p = leader;
5110                 memcg = mem_cgroup_from_css(css);
5111         }
5112         if (!p)
5113                 return 0;
5114 
5115         /*
5116          * We are now committed to this value, whatever it is. Changes in this
5117          * tunable will only affect upcoming migrations, not the current one.
5118          * So we need to save it, and keep it going.
5119          */
5120         move_flags = READ_ONCE(memcg->move_charge_at_immigrate);
5121         if (!move_flags)
5122                 return 0;
5123 
5124         from = mem_cgroup_from_task(p);
5125 
5126         VM_BUG_ON(from == memcg);
5127 
5128         mm = get_task_mm(p);
5129         if (!mm)
5130                 return 0;
5131         /* We move charges only when we move an owner of the mm */
5132         if (mm->owner == p) {
5133                 VM_BUG_ON(mc.from);
5134                 VM_BUG_ON(mc.to);
5135                 VM_BUG_ON(mc.precharge);
5136                 VM_BUG_ON(mc.moved_charge);
5137                 VM_BUG_ON(mc.moved_swap);
5138 
5139                 spin_lock(&mc.lock);
5140                 mc.mm = mm;
5141                 mc.from = from;
5142                 mc.to = memcg;
5143                 mc.flags = move_flags;
5144                 spin_unlock(&mc.lock);
5145                 /* We set mc.moving_task later */
5146 
5147                 ret = mem_cgroup_precharge_mc(mm);
5148                 if (ret)
5149                         mem_cgroup_clear_mc();
5150         } else {
5151                 mmput(mm);
5152         }
5153         return ret;
5154 }
5155 
5156 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5157 {
5158         if (mc.to)
5159                 mem_cgroup_clear_mc();
5160 }
5161 
5162 static int mem_cgroup_move_charge_pte_range(pmd_t *pmd,
5163                                 unsigned long addr, unsigned long end,
5164                                 struct mm_walk *walk)
5165 {
5166         int ret = 0;
5167         struct vm_area_struct *vma = walk->vma;
5168         pte_t *pte;
5169         spinlock_t *ptl;
5170         enum mc_target_type target_type;
5171         union mc_target target;
5172         struct page *page;
5173 
5174         ptl = pmd_trans_huge_lock(pmd, vma);
5175         if (ptl) {
5176                 if (mc.precharge < HPAGE_PMD_NR) {
5177                         spin_unlock(ptl);
5178                         return 0;
5179                 }
5180                 target_type = get_mctgt_type_thp(vma, addr, *pmd, &target);
5181                 if (target_type == MC_TARGET_PAGE) {
5182                         page = target.page;
5183                         if (!isolate_lru_page(page)) {
5184                                 if (!mem_cgroup_move_account(page, true,
5185                                                              mc.from, mc.to)) {
5186                                         mc.precharge -= HPAGE_PMD_NR;
5187                                         mc.moved_charge += HPAGE_PMD_NR;
5188                                 }
5189                                 putback_lru_page(page);
5190                         }
5191                         put_page(page);
5192                 } else if (target_type == MC_TARGET_DEVICE) {
5193                         page = target.page;
5194                         if (!mem_cgroup_move_account(page, true,
5195                                                      mc.from, mc.to)) {
5196                                 mc.precharge -= HPAGE_PMD_NR;
5197                                 mc.moved_charge += HPAGE_PMD_NR;
5198                         }
5199                         put_page(page);
5200                 }
5201                 spin_unlock(ptl);
5202                 return 0;
5203         }
5204 
5205         if (pmd_trans_unstable(pmd))
5206                 return 0;
5207 retry:
5208         pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
5209         for (; addr != end; addr += PAGE_SIZE) {
5210                 pte_t ptent = *(pte++);
5211                 bool device = false;
5212                 swp_entry_t ent;
5213 
5214                 if (!mc.precharge)
5215                         break;
5216 
5217                 switch (get_mctgt_type(vma, addr, ptent, &target)) {
5218                 case MC_TARGET_DEVICE:
5219                         device = true;
5220                         /* fall through */
5221                 case MC_TARGET_PAGE:
5222                         page = target.page;
5223                         /*
5224                          * We can have a part of the split pmd here. Moving it
5225                          * can be done, but it would be too convoluted, so simply
5226                          * ignore such a partial THP and keep it in the original
5227                          * memcg. There should be somebody mapping the head.
5228                          */
5229                         if (PageTransCompound(page))
5230                                 goto put;
5231                         if (!device && isolate_lru_page(page))
5232                                 goto put;
5233                         if (!mem_cgroup_move_account(page, false,
5234                                                 mc.from, mc.to)) {
5235                                 mc.precharge--;
5236                                 /* we uncharge from mc.from later. */
5237                                 mc.moved_charge++;
5238                         }
5239                         if (!device)
5240                                 putback_lru_page(page);
5241 put:                    /* get_mctgt_type() gets the page */
5242                         put_page(page);
5243                         break;
5244                 case MC_TARGET_SWAP:
5245                         ent = target.ent;
5246                         if (!mem_cgroup_move_swap_account(ent, mc.from, mc.to)) {
5247                                 mc.precharge--;
5248                                 /* we fixup refcnts and charges later. */
5249                                 mc.moved_swap++;
5250                         }
5251                         break;
5252                 default:
5253                         break;
5254                 }
5255         }
5256         pte_unmap_unlock(pte - 1, ptl);
5257         cond_resched();
5258 
5259         if (addr != end) {
5260                 /*
5261                  * We have consumed all the precharges we got in can_attach().
5262                  * We try to charge one by one, but don't do any additional
5263                  * charges to mc.to if we have already failed to charge once in
5264                  * the attach() phase.
5265                  */
5266                 ret = mem_cgroup_do_precharge(1);
5267                 if (!ret)
5268                         goto retry;
5269         }
5270 
5271         return ret;
5272 }
5273 
5274 static void mem_cgroup_move_charge(void)
5275 {
5276         struct mm_walk mem_cgroup_move_charge_walk = {
5277                 .pmd_entry = mem_cgroup_move_charge_pte_range,
5278                 .mm = mc.mm,
5279         };
5280 
5281         lru_add_drain_all();
5282         /*
5283          * Signal lock_page_memcg() to take the memcg's move_lock
5284          * while we're moving its pages to another memcg. Then wait
5285          * for already started RCU-only updates to finish.
5286          */
5287         atomic_inc(&mc.from->moving_account);
5288         synchronize_rcu();
5289 retry:
5290         if (unlikely(!down_read_trylock(&mc.mm->mmap_sem))) {
5291                 /*
5292                  * Someone who is holding the mmap_sem might be waiting on the
5293                  * waitq. So we cancel all extra charges, wake up all waiters,
5294                  * and retry. Because we cancel precharges, we might not be able
5295                  * to move enough charges, but moving charge is a best-effort
5296                  * feature anyway, so it wouldn't be a big problem.
5297                  */
5298                 __mem_cgroup_clear_mc();
5299                 cond_resched();
5300                 goto retry;
5301         }
5302         /*
5303          * When we have consumed all precharges and failed to do an
5304          * additional charge, the page walk just aborts.
5305          */
5306         walk_page_range(0, mc.mm->highest_vm_end, &mem_cgroup_move_charge_walk);
5307 
5308         up_read(&mc.mm->mmap_sem);
5309         atomic_dec(&mc.from->moving_account);
5310 }
5311 
5312 static void mem_cgroup_move_task(void)
5313 {
5314         if (mc.to) {
5315                 mem_cgroup_move_charge();
5316                 mem_cgroup_clear_mc();
5317         }
5318 }
5319 #else   /* !CONFIG_MMU */
5320 static int mem_cgroup_can_attach(struct cgroup_taskset *tset)
5321 {
5322         return 0;
5323 }
5324 static void mem_cgroup_cancel_attach(struct cgroup_taskset *tset)
5325 {
5326 }
5327 static void mem_cgroup_move_task(void)
5328 {
5329 }
5330 #endif
5331 
5332 /*
5333  * Cgroup retains root cgroups across [un]mount cycles making it necessary
5334  * to verify whether we're attached to the default hierarchy on each mount
5335  * attempt.
5336  */
5337 static void mem_cgroup_bind(struct cgroup_subsys_state *root_css)
5338 {
5339         /*
5340          * use_hierarchy is forced on the default hierarchy.  cgroup core
5341          * guarantees that @root doesn't have any children, so turning it
5342          * on for the root memcg is enough.
5343          */
5344         if (cgroup_subsys_on_dfl(memory_cgrp_subsys))
5345                 root_mem_cgroup->use_hierarchy = true;
5346         else
5347                 root_mem_cgroup->use_hierarchy = false;
5348 }
5349 
5350 static u64 memory_current_read(struct cgroup_subsys_state *css,
5351                                struct cftype *cft)
5352 {
5353         struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5354 
5355         return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE;
5356 }
5357 
5358 static int memory_min_show(struct seq_file *m, void *v)
5359 {
5360         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5361         unsigned long min = READ_ONCE(memcg->memory.min);
5362 
5363         if (min == PAGE_COUNTER_MAX)
5364                 seq_puts(m, "max\n");
5365         else
5366                 seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE);
5367 
5368         return 0;
5369 }
5370 
5371 static ssize_t memory_min_write(struct kernfs_open_file *of,
5372                                 char *buf, size_t nbytes, loff_t off)
5373 {
5374         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5375         unsigned long min;
5376         int err;
5377 
5378         buf = strstrip(buf);
5379         err = page_counter_memparse(buf, "max", &min);
5380         if (err)
5381                 return err;
5382 
5383         page_counter_set_min(&memcg->memory, min);
5384 
5385         return nbytes;
5386 }
5387 
5388 static int memory_low_show(struct seq_file *m, void *v)
5389 {
5390         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5391         unsigned long low = READ_ONCE(memcg->memory.low);
5392 
5393         if (low == PAGE_COUNTER_MAX)
5394                 seq_puts(m, "max\n");
5395         else
5396                 seq_printf(m, "%llu\n", (u64)low * PAGE_SIZE);
5397 
5398         return 0;
5399 }
5400 
5401 static ssize_t memory_low_write(struct kernfs_open_file *of,
5402                                 char *buf, size_t nbytes, loff_t off)
5403 {
5404         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5405         unsigned long low;
5406         int err;
5407 
5408         buf = strstrip(buf);
5409         err = page_counter_memparse(buf, "max", &low);
5410         if (err)
5411                 return err;
5412 
5413         page_counter_set_low(&memcg->memory, low);
5414 
5415         return nbytes;
5416 }
5417 
5418 static int memory_high_show(struct seq_file *m, void *v)
5419 {
5420         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5421         unsigned long high = READ_ONCE(memcg->high);
5422 
5423         if (high == PAGE_COUNTER_MAX)
5424                 seq_puts(m, "max\n");
5425         else
5426                 seq_printf(m, "%llu\n", (u64)high * PAGE_SIZE);
5427 
5428         return 0;
5429 }
5430 
5431 static ssize_t memory_high_write(struct kernfs_open_file *of,
5432                                  char *buf, size_t nbytes, loff_t off)
5433 {
5434         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5435         unsigned long nr_pages;
5436         unsigned long high;
5437         int err;
5438 
5439         buf = strstrip(buf);
5440         err = page_counter_memparse(buf, "max", &high);
5441         if (err)
5442                 return err;
5443 
5444         memcg->high = high;
5445 
5446         nr_pages = page_counter_read(&memcg->memory);
5447         if (nr_pages > high)
5448                 try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
5449                                              GFP_KERNEL, true);
5450 
5451         memcg_wb_domain_size_changed(memcg);
5452         return nbytes;
5453 }
5454 
5455 static int memory_max_show(struct seq_file *m, void *v)
5456 {
5457         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5458         unsigned long max = READ_ONCE(memcg->memory.max);
5459 
5460         if (max == PAGE_COUNTER_MAX)
5461                 seq_puts(m, "max\n");
5462         else
5463                 seq_printf(m, "%llu\n", (u64)max * PAGE_SIZE);
5464 
5465         return 0;
5466 }
5467 
5468 static ssize_t memory_max_write(struct kernfs_open_file *of,
5469                                 char *buf, size_t nbytes, loff_t off)
5470 {
5471         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5472         unsigned int nr_reclaims = MEM_CGROUP_RECLAIM_RETRIES;
5473         bool drained = false;
5474         unsigned long max;
5475         int err;
5476 
5477         buf = strstrip(buf);
5478         err = page_counter_memparse(buf, "max", &max);
5479         if (err)
5480                 return err;
5481 
5482         xchg(&memcg->memory.max, max);
5483 
5484         for (;;) {
5485                 unsigned long nr_pages = page_counter_read(&memcg->memory);
5486 
5487                 if (nr_pages <= max)
5488                         break;
5489 
5490                 if (signal_pending(current)) {
5491                         err = -EINTR;
5492                         break;
5493                 }
5494 
5495                 if (!drained) {
5496                         drain_all_stock(memcg);
5497                         drained = true;
5498                         continue;
5499                 }
5500 
5501                 if (nr_reclaims) {
5502                         if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
5503                                                           GFP_KERNEL, true))
5504                                 nr_reclaims--;
5505                         continue;
5506                 }
5507 
5508                 memcg_memory_event(memcg, MEMCG_OOM);
5509                 if (!mem_cgroup_out_of_memory(memcg, GFP_KERNEL, 0))
5510                         break;
5511         }
5512 
5513         memcg_wb_domain_size_changed(memcg);
5514         return nbytes;
5515 }
5516 
5517 static int memory_events_show(struct seq_file *m, void *v)
5518 {
5519         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5520 
5521         seq_printf(m, "low %lu\n",
5522                    atomic_long_read(&memcg->memory_events[MEMCG_LOW]));
5523         seq_printf(m, "high %lu\n",
5524                    atomic_long_read(&memcg->memory_events[MEMCG_HIGH]));
5525         seq_printf(m, "max %lu\n",
5526                    atomic_long_read(&memcg->memory_events[MEMCG_MAX]));
5527         seq_printf(m, "oom %lu\n",
5528                    atomic_long_read(&memcg->memory_events[MEMCG_OOM]));
5529         seq_printf(m, "oom_kill %lu\n",
5530                    atomic_long_read(&memcg->memory_events[MEMCG_OOM_KILL]));
5531 
5532         return 0;
5533 }
5534 
5535 static int memory_stat_show(struct seq_file *m, void *v)
5536 {
5537         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5538         struct accumulated_stats acc;
5539         int i;
5540 
5541         /*
5542          * Provide statistics on the state of the memory subsystem as
5543          * well as cumulative event counters that show past behavior.
5544          *
5545          * This list is ordered following a combination of these gradients:
5546          * 1) generic big picture -> specifics and details
5547          * 2) reflecting userspace activity -> reflecting kernel heuristics
5548          *
5549          * Current memory state:
5550          */
5551 
5552         memset(&acc, 0, sizeof(acc));
5553         acc.stats_size = MEMCG_NR_STAT;
5554         acc.events_size = NR_VM_EVENT_ITEMS;
5555         accumulate_memcg_tree(memcg, &acc);
5556 
5557         seq_printf(m, "anon %llu\n",
5558                    (u64)acc.stat[MEMCG_RSS] * PAGE_SIZE);
5559         seq_printf(m, "file %llu\n",
5560                    (u64)acc.stat[MEMCG_CACHE] * PAGE_SIZE);
5561         seq_printf(m, "kernel_stack %llu\n",
5562                    (u64)acc.stat[MEMCG_KERNEL_STACK_KB] * 1024);
5563         seq_printf(m, "slab %llu\n",
5564                    (u64)(acc.stat[NR_SLAB_RECLAIMABLE] +
5565                          acc.stat[NR_SLAB_UNRECLAIMABLE]) * PAGE_SIZE);
5566         seq_printf(m, "sock %llu\n",
5567                    (u64)acc.stat[MEMCG_SOCK] * PAGE_SIZE);
5568 
5569         seq_printf(m, "shmem %llu\n",
5570                    (u64)acc.stat[NR_SHMEM] * PAGE_SIZE);
5571         seq_printf(m, "file_mapped %llu\n",
5572                    (u64)acc.stat[NR_FILE_MAPPED] * PAGE_SIZE);
5573         seq_printf(m, "file_dirty %llu\n",
5574                    (u64)acc.stat[NR_FILE_DIRTY] * PAGE_SIZE);
5575         seq_printf(m, "file_writeback %llu\n",
5576                    (u64)acc.stat[NR_WRITEBACK] * PAGE_SIZE);
5577 
5578         for (i = 0; i < NR_LRU_LISTS; i++)
5579                 seq_printf(m, "%s %llu\n", mem_cgroup_lru_names[i],
5580                            (u64)acc.lru_pages[i] * PAGE_SIZE);
5581 
5582         seq_printf(m, "slab_reclaimable %llu\n",
5583                    (u64)acc.stat[NR_SLAB_RECLAIMABLE] * PAGE_SIZE);
5584         seq_printf(m, "slab_unreclaimable %llu\n",
5585                    (u64)acc.stat[NR_SLAB_UNRECLAIMABLE] * PAGE_SIZE);
5586 
5587         /* Accumulated memory events */
5588 
5589         seq_printf(m, "pgfault %lu\n", acc.events[PGFAULT]);
5590         seq_printf(m, "pgmajfault %lu\n", acc.events[PGMAJFAULT]);
5591 
5592         seq_printf(m, "workingset_refault %lu\n",
5593                    acc.stat[WORKINGSET_REFAULT]);
5594         seq_printf(m, "workingset_activate %lu\n",
5595                    acc.stat[WORKINGSET_ACTIVATE]);
5596         seq_printf(m, "workingset_nodereclaim %lu\n",
5597                    acc.stat[WORKINGSET_NODERECLAIM]);
5598 
5599         seq_printf(m, "pgrefill %lu\n", acc.events[PGREFILL]);
5600         seq_printf(m, "pgscan %lu\n", acc.events[PGSCAN_KSWAPD] +
5601                    acc.events[PGSCAN_DIRECT]);
5602         seq_printf(m, "pgsteal %lu\n", acc.events[PGSTEAL_KSWAPD] +
5603                    acc.events[PGSTEAL_DIRECT]);
5604         seq_printf(m, "pgactivate %lu\n", acc.events[PGACTIVATE]);
5605         seq_printf(m, "pgdeactivate %lu\n", acc.events[PGDEACTIVATE]);
5606         seq_printf(m, "pglazyfree %lu\n", acc.events[PGLAZYFREE]);
5607         seq_printf(m, "pglazyfreed %lu\n", acc.events[PGLAZYFREED]);
5608 
5609         return 0;
5610 }
5611 
5612 static int memory_oom_group_show(struct seq_file *m, void *v)
5613 {
5614         struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m));
5615 
5616         seq_printf(m, "%d\n", memcg->oom_group);
5617 
5618         return 0;
5619 }
5620 
5621 static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
5622                                       char *buf, size_t nbytes, loff_t off)
5623 {
5624         struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
5625         int ret, oom_group;
5626 
5627         buf = strstrip(buf);
5628         if (!buf)
5629                 return -EINVAL;
5630 
5631         ret = kstrtoint(buf, 0, &oom_group);
5632         if (ret)
5633                 return ret;
5634 
5635         if (oom_group != 0 && oom_group != 1)
5636                 return -EINVAL;
5637 
5638         memcg->oom_group = oom_group;
5639 
5640         return nbytes;
5641 }
5642 
5643 static struct cftype memory_files[] = {
5644         {
5645                 .name = "current",
5646                 .flags = CFTYPE_NOT_ON_ROOT,
5647                 .read_u64 = memory_current_read,
5648         },
5649         {
5650                 .name = "min",
5651                 .flags = CFTYPE_NOT_ON_ROOT,
5652                 .seq_show = memory_min_show,
5653                 .write = memory_min_write,
5654         },
5655         {
5656                 .name = "low",
5657                 .flags = CFTYPE_NOT_ON_ROOT,
5658                 .seq_show = memory_low_show,
5659                 .write = memory_low_write,
5660         },
5661         {
5662                 .name = "high",
5663                 .flags = CFTYPE_NOT_ON_ROOT,
5664                 .seq_show = memory_high_show,
5665                 .write = memory_high_write,
5666         },
5667         {
5668                 .name = "max",
5669                 .flags = CFTYPE_NOT_ON_ROOT,
5670                 .seq_show = memory_max_show,
5671                 .write = memory_max_write,
5672         },
5673         {
5674                 .name = "events",
5675                 .flags = CFTYPE_NOT_ON_ROOT,
5676                 .file_offset = offsetof(struct mem_cgroup, events_file),
5677                 .seq_show = memory_events_show,
5678         },
5679         {
5680                 .name = "stat",
5681                 .flags = CFTYPE_NOT_ON_ROOT,
5682                 .seq_show = memory_stat_show,
5683         },
5684         {
5685                 .name = "oom.group",
5686                 .flags = CFTYPE_NOT_ON_ROOT | CFTYPE_NS_DELEGATABLE,
5687                 .seq_show = memory_oom_group_show,
5688                 .write = memory_oom_group_write,
5689         },
5690         { }     /* terminate */
5691 };
5692 
5693 struct cgroup_subsys memory_cgrp_subsys = {
5694         .css_alloc = mem_cgroup_css_alloc,
5695         .css_online = mem_cgroup_css_online,
5696         .css_offline = mem_cgroup_css_offline,
5697         .css_released = mem_cgroup_css_released,
5698         .css_free = mem_cgroup_css_free,
5699         .css_reset = mem_cgroup_css_reset,
5700         .can_attach = mem_cgroup_can_attach,
5701         .cancel_attach = mem_cgroup_cancel_attach,
5702         .post_attach = mem_cgroup_move_task,
5703         .bind = mem_cgroup_bind,
5704         .dfl_cftypes = memory_files,
5705         .legacy_cftypes = mem_cgroup_legacy_files,
5706         .early_init = 0,
5707 };
5708 
5709 /**
5710  * mem_cgroup_protected - check if memory consumption is in the normal range
5711  * @root: the top ancestor of the sub-tree being checked
5712  * @memcg: the memory cgroup to check
5713  *
5714  * WARNING: This function is not stateless! It can only be used as part
5715  *          of a top-down tree iteration, not for isolated queries.
5716  *
5717  * Returns one of the following:
5718  *   MEMCG_PROT_NONE: cgroup memory is not protected
5719  *   MEMCG_PROT_LOW: cgroup memory is protected as long as there is
5720  *     an unprotected supply of reclaimable memory from other cgroups.
5721  *   MEMCG_PROT_MIN: cgroup memory is protected
5722  *
5723  * @root is exclusive; it is never protected when looked at directly
5724  *
5725  * To provide a proper hierarchical behavior, effective memory.min/low values
5726  * are used. Below is the description of how effective memory.low is calculated.
5727  * The effective memory.min value is calculated in the same way.
5728  *
5729  * Effective memory.low is always equal to or less than the original memory.low.
5730  * If there is no memory.low overcommitment (which is always true for
5731  * top-level memory cgroups), these two values are equal.
5732  * Otherwise, it's a part of the parent's effective memory.low,
5733  * calculated as the cgroup's memory.low usage divided by the sum of the
5734  * siblings' memory.low usages, where memory.low usage is the size of actually
5735  * protected memory.
5736  *
5737  *                                             low_usage
5738  * elow = min( memory.low, parent->elow * ------------------ ),
5739  *                                        siblings_low_usage
5740  *
5741  *             | memory.current, if memory.current < memory.low
5742  * low_usage = |
5743  *             | 0, otherwise.
5744  *
5745  *
5746  * Such a definition of the effective memory.low provides the expected
5747  * hierarchical behavior: the parent's memory.low value limits its
5748  * children, unprotected memory is reclaimed first, and cgroups
5749  * which are not using their guarantee do not affect the actual memory
5750  * distribution.
5751  *
5752  * For example, if there are memcgs A, A/B, A/C, A/D and A/E:
5753  *
5754  *     A      A/memory.low = 2G, A/memory.current = 6G
5755  *    //\\
5756  *   BC  DE   B/memory.low = 3G  B/memory.current = 2G
5757  *            C/memory.low = 1G  C/memory.current = 2G
5758  *            D/memory.low = 0   D/memory.current = 2G
5759  *            E/memory.low = 10G E/memory.current = 0
5760  *
5761  * and memory pressure is applied, the following memory distribution
5762  * is expected (approximately):
5763  *
5764  *     A/memory.current = 2G
5765  *
5766  *     B/memory.current = 1.3G
5767  *     C/memory.current = 0.6G
5768  *     D/memory.current = 0
5769  *     E/memory.current = 0
5770  *
5771  * These calculations require constant tracking of the actual low usages
5772  * (see propagate_protected_usage()), as well as recursive calculation of
5773  * effective memory.low values. But as we do call mem_cgroup_protected()
5774  * for each memory cgroup top-down from the reclaim path,
5775  * it's possible to optimize this part and save the calculated elow
5776  * for the next use. This part is intentionally racy, but that's OK,
5777  * as memory.low is a best-effort mechanism.
5778  */
5779 enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root,
5780                                                 struct mem_cgroup *memcg)
5781 {
5782         struct mem_cgroup *parent;
5783         unsigned long emin, parent_emin;
5784         unsigned long elow, parent_elow;
5785         unsigned long usage;
5786 
5787         if (mem_cgroup_disabled())
5788                 return MEMCG_PROT_NONE;
5789 
5790         if (!root)
5791                 root = root_mem_cgroup;
5792         if (memcg == root)
5793                 return MEMCG_PROT_NONE;
5794 
5795         usage = page_counter_read(&memcg->memory);
5796         if (!usage)
5797                 return MEMCG_PROT_NONE;
5798 
5799         emin = memcg->memory.min;
5800         elow = memcg->memory.low;
5801 
5802         parent = parent_mem_cgroup(memcg);
5803         /* No parent means a non-hierarchical mode on v1 memcg */
5804         if (!parent)
5805                 return MEMCG_PROT_NONE;
5806 
5807         if (parent == root)
5808                 goto exit;
5809 
5810         parent_emin = READ_ONCE(parent->memory.emin);
5811         emin = min(emin, parent_emin);
5812         if (emin && parent_emin) {
5813                 unsigned long min_usage, siblings_min_usage;
5814 
5815                 min_usage = min(usage, memcg->memory.min);
5816                 siblings_min_usage = atomic_long_read(
5817                         &parent->memory.children_min_usage);
5818 
5819                 if (min_usage && siblings_min_usage)
5820                         emin = min(emin, parent_emin * min_usage /
5821                                    siblings_min_usage);
5822         }
5823 
5824         parent_elow = READ_ONCE(parent->memory.elow);
5825         elow = min(elow, parent_elow);
5826         if (elow && parent_elow) {
5827                 unsigned long low_usage, siblings_low_usage;
5828 
5829                 low_usage = min(usage, memcg->memory.low);
5830                 siblings_low_usage = atomic_long_read(
5831                         &parent->memory.children_low_usage);
5832 
5833                 if (low_usage && siblings_low_usage)
5834                         elow = min(elow, parent_elow * low_usage /
5835                                    siblings_low_usage);
5836         }
5837 
5838 exit:
5839         memcg->memory.emin = emin;
5840         memcg->memory.elow = elow;
5841 
5842         if (usage <= emin)
5843                 return MEMCG_PROT_MIN;
5844         else if (usage <= elow)
5845                 return MEMCG_PROT_LOW;
5846         else
5847                 return MEMCG_PROT_NONE;
5848 }
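/*
 * The numbers in the A/B/C/D/E example above can be reproduced directly
 * from the calculation this function performs (note that, as in the code,
 * low_usage = min(usage, memory.low)).  The sketch below is a standalone
 * user-space program, not kernel code; every name in it is local to the
 * example.  It prints B: 1.33G, C: 0.67G, D: 0.00G and E: 2.00G; E keeps
 * an elow capped at the parent's 2G, but with zero usage it is returned
 * MEMCG_PROT_NONE before protection matters.
 */
#include <stdio.h>

static double min_d(double a, double b)
{
        return a < b ? a : b;
}

int main(void)
{
        const double parent_elow = 2.0;                   /* A/memory.low, in G */
        const double low[]     = { 3.0, 1.0, 0.0, 10.0 }; /* B..E memory.low    */
        const double current[] = { 2.0, 2.0, 2.0,  0.0 }; /* B..E memory.current */
        double low_usage[4], siblings_low_usage = 0.0;
        int i;

        for (i = 0; i < 4; i++) {
                /* low_usage = min(usage, memory.low), as in the code above */
                low_usage[i] = min_d(current[i], low[i]);
                siblings_low_usage += low_usage[i];
        }

        for (i = 0; i < 4; i++) {
                /* elow starts as min(memory.low, parent->elow)... */
                double elow = min_d(low[i], parent_elow);

                /* ...and is scaled by this cgroup's share of the low usage */
                if (low_usage[i] > 0.0 && siblings_low_usage > 0.0)
                        elow = min_d(elow, parent_elow * low_usage[i] /
                                           siblings_low_usage);
                printf("%c: elow = %.2fG\n", 'B' + i, elow);
        }
        return 0;
}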
5849 
5850 /**
5851  * mem_cgroup_try_charge - try charging a page
5852  * @page: page to charge
5853  * @mm: mm context of the victim
5854  * @gfp_mask: reclaim mode
5855  * @memcgp: charged memcg return
5856  * @compound: charge the page as compound or small page
5857  *
5858  * Try to charge @page to the memcg that @mm belongs to, reclaiming
5859  * pages according to @gfp_mask if necessary.
5860  *
5861  * Returns 0 on success, with *@memcgp pointing to the charged memcg.
5862  * Otherwise, an error code is returned.
5863  *
5864  * After page->mapping has been set up, the caller must finalize the
5865  * charge with mem_cgroup_commit_charge().  Or abort the transaction
5866  * with mem_cgroup_cancel_charge() in case page instantiation fails.
5867  */
5868 int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
5869                           gfp_t gfp_mask, struct mem_cgroup **memcgp,
5870                           bool compound)
5871 {
5872         struct mem_cgroup *memcg = NULL;
5873         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5874         int ret = 0;
5875 
5876         if (mem_cgroup_disabled())
5877                 goto out;
5878 
5879         if (PageSwapCache(page)) {
5880                 /*
5881                  * Every swap fault against a single page tries to charge the
5882                  * page, bail as early as possible.  shmem_unuse() encounters
5883                  * already charged pages, too.  The USED bit is protected by
5884                  * the page lock, which serializes swap cache removal, which
5885                  * in turn serializes uncharging.
5886                  */
5887                 VM_BUG_ON_PAGE(!PageLocked(page), page);
5888                 if (compound_head(page)->mem_cgroup)
5889                         goto out;
5890 
5891                 if (do_swap_account) {
5892                         swp_entry_t ent = { .val = page_private(page), };
5893                         unsigned short id = lookup_swap_cgroup_id(ent);
5894 
5895                         rcu_read_lock();
5896                         memcg = mem_cgroup_from_id(id);
5897                         if (memcg && !css_tryget_online(&memcg->css))
5898                                 memcg = NULL;
5899                         rcu_read_unlock();
5900                 }
5901         }
5902 
5903         if (!memcg)
5904                 memcg = get_mem_cgroup_from_mm(mm);
5905 
5906         ret = try_charge(memcg, gfp_mask, nr_pages);
5907 
5908         css_put(&memcg->css);
5909 out:
5910         *memcgp = memcg;
5911         return ret;
5912 }
5913 
5914 int mem_cgroup_try_charge_delay(struct page *page, struct mm_struct *mm,
5915                           gfp_t gfp_mask, struct mem_cgroup **memcgp,
5916                           bool compound)
5917 {
5918         struct mem_cgroup *memcg;
5919         int ret;
5920 
5921         ret = mem_cgroup_try_charge(page, mm, gfp_mask, memcgp, compound);
5922         memcg = *memcgp;
5923         mem_cgroup_throttle_swaprate(memcg, page_to_nid(page), gfp_mask);
5924         return ret;
5925 }
5926 
5927 /**
5928  * mem_cgroup_commit_charge - commit a page charge
5929  * @page: page to charge
5930  * @memcg: memcg to charge the page to
5931  * @lrucare: page might be on LRU already
5932  * @compound: charge the page as compound or small page
5933  *
5934  * Finalize a charge transaction started by mem_cgroup_try_charge(),
5935  * after page->mapping has been set up.  This must happen atomically
5936  * as part of the page instantiation, i.e. under the page table lock
5937  * for anonymous pages, under the page lock for page and swap cache.
5938  *
5939  * In addition, the page must not be on the LRU during the commit, to
5940  * prevent racing with task migration.  If it might be, use @lrucare.
5941  *
5942  * Use mem_cgroup_cancel_charge() to cancel the transaction instead.
5943  */
5944 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
5945                               bool lrucare, bool compound)
5946 {
5947         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5948 
5949         VM_BUG_ON_PAGE(!page->mapping, page);
5950         VM_BUG_ON_PAGE(PageLRU(page) && !lrucare, page);
5951 
5952         if (mem_cgroup_disabled())
5953                 return;
5954         /*
5955          * Swap faults will attempt to charge the same page multiple
5956          * times.  But reuse_swap_page() might have removed the page
5957          * from swapcache already, so we can't check PageSwapCache().
5958          */
5959         if (!memcg)
5960                 return;
5961 
5962         commit_charge(page, memcg, lrucare);
5963 
5964         local_irq_disable();
5965         mem_cgroup_charge_statistics(memcg, page, compound, nr_pages);
5966         memcg_check_events(memcg, page);
5967         local_irq_enable();
5968 
5969         if (do_memsw_account() && PageSwapCache(page)) {
5970                 swp_entry_t entry = { .val = page_private(page) };
5971                 /*
5972                  * The swap entry might not get freed for a long time,
5973                  * let's not wait for it.  The page already received a
5974                  * memory+swap charge, drop the swap entry duplicate.
5975                  */
5976                 mem_cgroup_uncharge_swap(entry, nr_pages);
5977         }
5978 }
5979 
5980 /**
5981  * mem_cgroup_cancel_charge - cancel a page charge
5982  * @page: page to charge
5983  * @memcg: memcg to charge the page to
5984  * @compound: charge the page as compound or small page
5985  *
5986  * Cancel a charge transaction started by mem_cgroup_try_charge().
5987  */
5988 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg,
5989                 bool compound)
5990 {
5991         unsigned int nr_pages = compound ? hpage_nr_pages(page) : 1;
5992 
5993         if (mem_cgroup_disabled())
5994                 return;
5995         /*
5996          * Swap faults will attempt to charge the same page multiple
5997          * times.  But reuse_swap_page() might have removed the page
5998          * from swapcache already, so we can't check PageSwapCache().
5999          */
6000         if (!memcg)
6001                 return;
6002 
6003         cancel_charge(memcg, nr_pages);
6004 }
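
/*
 * Illustrative sketch (hypothetical caller, locking elided): how the
 * try/commit/cancel transaction documented above is typically driven.
 * example_instantiate() and insert_into_mapping() are made-up names;
 * the real callers are the anon fault, shmem and swapin paths.
 */
#if 0
static int example_instantiate(struct page *page, struct mm_struct *mm,
                               gfp_t gfp)
{
        struct mem_cgroup *memcg;
        int ret;

        ret = mem_cgroup_try_charge(page, mm, gfp, &memcg, false);
        if (ret)
                return ret;

        ret = insert_into_mapping(page);        /* sets page->mapping */
        if (ret) {
                /* Instantiation failed: give the reserved charge back. */
                mem_cgroup_cancel_charge(page, memcg, false);
                return ret;
        }

        /* The page is new and not yet on the LRU, so lrucare == false. */
        mem_cgroup_commit_charge(page, memcg, false, false);
        /* ... then add the page to the LRU ... */
        return 0;
}
#endif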
6005 
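/*
 * Gather state for batched uncharging: pages belonging to the same
 * memcg are accumulated here so that page counters, statistics and
 * css references are updated once per batch in uncharge_batch().
 * dummy_page is only passed to memcg_check_events().
 */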
6006 struct uncharge_gather {
6007         struct mem_cgroup *memcg;
6008         unsigned long pgpgout;
6009         unsigned long nr_anon;
6010         unsigned long nr_file;
6011         unsigned long nr_kmem;
6012         unsigned long nr_huge;
6013         unsigned long nr_shmem;
6014         struct page *dummy_page;
6015 };
6016 
6017 static inline void uncharge_gather_clear(struct uncharge_gather *ug)
6018 {
6019         memset(ug, 0, sizeof(*ug));
6020 }
6021 
6022 static void uncharge_batch(const struct uncharge_gather *ug)
6023 {
6024         unsigned long nr_pages = ug->nr_anon + ug->nr_file + ug->nr_kmem;
6025         unsigned long flags;
6026 
6027         if (!mem_cgroup_is_root(ug->memcg)) {
6028                 page_counter_uncharge(&ug->memcg->memory, nr_pages);
6029                 if (do_memsw_account())
6030                         page_counter_uncharge(&ug->memcg->memsw, nr_pages);
6031                 if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && ug->nr_kmem)
6032                         page_counter_uncharge(&ug->memcg->kmem, ug->nr_kmem);
6033                 memcg_oom_recover(ug->memcg);
6034         }
6035 
6036         local_irq_save(flags);
6037         __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon);
6038         __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file);
6039         __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge);
6040         __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem);
6041         __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout);
6042         __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages);
6043         memcg_check_events(ug->memcg, ug->dummy_page);
6044         local_irq_restore(flags);
6045 
6046         if (!mem_cgroup_is_root(ug->memcg))
6047                 css_put_many(&ug->memcg->css, nr_pages);
6048 }
6049 
6050 static void uncharge_page(struct page *page, struct uncharge_gather *ug)
6051 {
6052         VM_BUG_ON_PAGE(PageLRU(page), page);
6053         VM_BUG_ON_PAGE(page_count(page) && !is_zone_device_page(page) &&
6054                         !PageHWPoison(page), page);
6055 
6056         if (!page->mem_cgroup)
6057                 return;
6058 
6059         /*
6060          * Nobody should be changing or seriously looking at
6061          * page->mem_cgroup at this point, we have fully
6062          * exclusive access to the page.
6063          */
6064 
6065         if (ug->memcg != page->mem_cgroup) {
6066                 if (ug->memcg) {
6067                         uncharge_batch(ug);
6068                         uncharge_gather_clear(ug);
6069                 }
6070                 ug->memcg = page->mem_cgroup;
6071         }
6072 
6073         if (!PageKmemcg(page)) {
6074                 unsigned int nr_pages = 1;
6075 
6076                 if (PageTransHuge(page)) {
6077                         nr_pages <<= compound_order(page);
6078                         ug->nr_huge += nr_pages;
6079                 }
6080                 if (PageAnon(page))
6081                         ug->nr_anon += nr_pages;
6082                 else {
6083                         ug->nr_file += nr_pages;
6084                         if (PageSwapBacked(page))
6085                                 ug->nr_shmem += nr_pages;
6086                 }
6087                 ug->pgpgout++;
6088         } else {
6089                 ug->nr_kmem += 1 << compound_order(page);
6090                 __ClearPageKmemcg(page);
6091         }
6092 
6093         ug->dummy_page = page;
6094         page->mem_cgroup = NULL;
6095 }
6096 
6097 static void uncharge_list(struct list_head *page_list)
6098 {
6099         struct uncharge_gather ug;
6100         struct list_head *next;
6101 
6102         uncharge_gather_clear(&ug);
6103 
6104         /*
6105          * Note that the list can be a single page->lru; hence the
6106          * do-while loop instead of a simple list_for_each_entry().
6107          */
6108         next = page_list->next;
6109         do {
6110                 struct page *page;
6111 
6112                 page = list_entry(next, struct page, lru);
6113                 next = page->lru.next;
6114 
6115                 uncharge_page(page, &ug);
6116         } while (next != page_list);
6117 
6118         if (ug.memcg)
6119                 uncharge_batch(&ug);
6120 }
6121 
6122 /**
6123  * mem_cgroup_uncharge - uncharge a page
6124  * @page: page to uncharge
6125  *
6126  * Uncharge a page previously charged with mem_cgroup_try_charge() and
6127  * mem_cgroup_commit_charge().
6128  */
6129 void mem_cgroup_uncharge(struct page *page)
6130 {
6131         struct uncharge_gather ug;
6132 
6133         if (mem_cgroup_disabled())
6134                 return;
6135 
6136         /* Don't touch page->lru of any random page, pre-check: */
6137         if (!page->mem_cgroup)
6138                 return;
6139 
6140         uncharge_gather_clear(&ug);
6141         uncharge_page(page, &ug);
6142         uncharge_batch(&ug);
6143 }
6144 
6145 /**
6146  * mem_cgroup_uncharge_list - uncharge a list of pages
6147  * @page_list: list of pages to uncharge
6148  *
6149  * Uncharge a list of pages previously charged with
6150  * mem_cgroup_try_charge() and mem_cgroup_commit_charge().
6151  */
6152 void mem_cgroup_uncharge_list(struct list_head *page_list)
6153 {
6154         if (mem_cgroup_disabled())
6155                 return;
6156 
6157         if (!list_empty(page_list))
6158                 uncharge_list(page_list);
6159 }
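
/*
 * Illustrative sketch (hypothetical helper name): the uncharge entry
 * points are driven from the final page-release paths, e.g.
 * release_pages() for batches and the single-page put path for
 * individual pages.
 */
#if 0
static void example_release(struct list_head *pages_to_free,
                            struct page *page)
{
        /* Batch: charges are gathered per-memcg and dropped in one go. */
        mem_cgroup_uncharge_list(pages_to_free);
        free_unref_page_list(pages_to_free);

        /* Single page: same bookkeeping without the list walk. */
        mem_cgroup_uncharge(page);
        free_unref_page(page);
}
#endif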
6160 
6161 /**
6162  * mem_cgroup_migrate - charge a page's replacement
6163  * @oldpage: currently circulating page
6164  * @newpage: replacement page
6165  *
6166  * Charge @newpage as a replacement page for @oldpage. @oldpage will
6167  * be uncharged upon free.
6168  *
6169  * Both pages must be locked, @newpage->mapping must be set up.
6170  */
6171 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage)
6172 {
6173         struct mem_cgroup *memcg;
6174         unsigned int nr_pages;
6175         bool compound;
6176         unsigned long flags;
6177 
6178         VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage);
6179         VM_BUG_ON_PAGE(!PageLocked(newpage), newpage);
6180         VM_BUG_ON_PAGE(PageAnon(oldpage) != PageAnon(newpage), newpage);
6181         VM_BUG_ON_PAGE(PageTransHuge(oldpage) != PageTransHuge(newpage),
6182                        newpage);
6183 
6184         if (mem_cgroup_disabled())
6185                 return;
6186 
6187         /* Page cache replacement: new page already charged? */
6188         if (newpage->mem_cgroup)
6189                 return;
6190 
6191         /* Swapcache readahead pages can get replaced before being charged */
6192         memcg = oldpage->mem_cgroup;
6193         if (!memcg)
6194                 return;
6195 
6196         /* Force-charge the new page. The old one will be freed soon */
6197         compound = PageTransHuge(newpage);
6198         nr_pages = compound ? hpage_nr_pages(newpage) : 1;
6199 
6200         page_counter_charge(&memcg->memory, nr_pages);
6201         if (do_memsw_account())
6202                 page_counter_charge(&memcg->memsw, nr_pages);
6203         css_get_many(&memcg->css, nr_pages);
6204 
6205         commit_charge(newpage, memcg, false);
6206 
6207         local_irq_save(flags);
6208         mem_cgroup_charge_statistics(memcg, newpage, compound, nr_pages);
6209         memcg_check_events(memcg, newpage);
6210         local_irq_restore(flags);
6211 }
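
/*
 * Illustrative sketch (hypothetical helper name, locking simplified):
 * replacing one page with another while keeping the memcg charge, as
 * done for page cache replacement, e.g. replace_page_cache_page() in
 * mm/filemap.c.
 */
#if 0
static void example_replace_in_mapping(struct page *old, struct page *new)
{
        lock_page(old);
        lock_page(new);

        /* ... install @new in the mapping, set new->mapping/index ... */

        mem_cgroup_migrate(old, new);   /* charge @new up front */

        unlock_page(new);
        unlock_page(old);
        put_page(old);          /* @old is uncharged when finally freed */
}
#endif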
6212 
6213 DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key);
6214 EXPORT_SYMBOL(memcg_sockets_enabled_key);
6215 
6216 void mem_cgroup_sk_alloc(struct sock *sk)
6217 {
6218         struct mem_cgroup *memcg;
6219 
6220         if (!mem_cgroup_sockets_enabled)