/* memcontrol.c - Memory Controller
 *
 * Copyright IBM Corporation, 2007
 * Author Balbir Singh <balbir@linux.vnet.ibm.com>
 *
 * Copyright 2007 OpenVZ SWsoft Inc
 * Author: Pavel Emelianov <xemul@openvz.org>
 *
 * Memory thresholds
 * Copyright (C) 2009 Nokia Corporation
 * Author: Kirill A. Shutemov
 *
 * Kernel Memory Controller
 * Copyright (C) 2012 Parallels Inc. and Google Inc.
 * Authors: Glauber Costa and Suleiman Souhlal
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 */

#include <linux/res_counter.h>
#include <linux/memcontrol.h>
#include <linux/cgroup.h>
#include <linux/mm.h>
#include <linux/hugetlb.h>
#include <linux/pagemap.h>
#include <linux/smp.h>
#include <linux/page-flags.h>
#include <linux/backing-dev.h>
#include <linux/bit_spinlock.h>
#include <linux/rcupdate.h>
#include <linux/limits.h>
#include <linux/export.h>
#include <linux/mutex.h>
#include <linux/rbtree.h>
#include <linux/slab.h>
#include <linux/swap.h>
#include <linux/swapops.h>
#include <linux/spinlock.h>
#include <linux/eventfd.h>
#include <linux/sort.h>
#include <linux/fs.h>
#include <linux/seq_file.h>
#include <linux/vmalloc.h>
#include <linux/vmpressure.h>
#include <linux/mm_inline.h>
#include <linux/page_cgroup.h>
#include <linux/cpu.h>
#include <linux/oom.h>
#include <linux/lockdep.h>
#include "internal.h"
#include <net/sock.h>
#include <net/ip.h>
#include <net/tcp_memcontrol.h>
#include "slab.h"

#include <asm/uaccess.h>

#include <trace/events/vmscan.h>

struct cgroup_subsys mem_cgroup_subsys __read_mostly;
EXPORT_SYMBOL(mem_cgroup_subsys);

#define MEM_CGROUP_RECLAIM_RETRIES	5
static struct mem_cgroup *root_mem_cgroup __read_mostly;

#ifdef CONFIG_MEMCG_SWAP
/* Turned on only when memory cgroup is enabled && really_do_swap_account = 1 */
int do_swap_account __read_mostly;

/* for remembering the boot option */
#ifdef CONFIG_MEMCG_SWAP_ENABLED
static int really_do_swap_account __initdata = 1;
#else
static int really_do_swap_account __initdata = 0;
#endif

#else
#define do_swap_account		0
#endif


static const char * const mem_cgroup_stat_names[] = {
	"cache",
	"rss",
	"rss_huge",
	"mapped_file",
	"writeback",
	"swap",
};

enum mem_cgroup_events_index {
	MEM_CGROUP_EVENTS_PGPGIN,	/* # of pages paged in */
	MEM_CGROUP_EVENTS_PGPGOUT,	/* # of pages paged out */
	MEM_CGROUP_EVENTS_PGFAULT,	/* # of page-faults */
	MEM_CGROUP_EVENTS_PGMAJFAULT,	/* # of major page-faults */
	MEM_CGROUP_EVENTS_NSTATS,
};

static const char * const mem_cgroup_events_names[] = {
	"pgpgin",
	"pgpgout",
	"pgfault",
	"pgmajfault",
};

static const char * const mem_cgroup_lru_names[] = {
	"inactive_anon",
	"active_anon",
	"inactive_file",
	"active_file",
	"unevictable",
};

/*
 * Per memcg event counter is incremented at every pagein/pageout. With THP,
 * it will be incremented by the number of pages. This counter is used to
 * trigger some periodic events. This is straightforward and better than
 * using jiffies etc. to handle periodic memcg events.
 */
enum mem_cgroup_events_target {
	MEM_CGROUP_TARGET_THRESH,
	MEM_CGROUP_TARGET_SOFTLIMIT,
	MEM_CGROUP_TARGET_NUMAINFO,
	MEM_CGROUP_NTARGETS,
};
#define THRESHOLDS_EVENTS_TARGET 128
#define SOFTLIMIT_EVENTS_TARGET 1024
#define NUMAINFO_EVENTS_TARGET	1024
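
/*
 * Illustrative only (the numbers below are an example of how these targets
 * are consumed by mem_cgroup_event_ratelimit(), not extra tuning): with
 * nr_page_events == 130 and targets[MEM_CGROUP_TARGET_THRESH] == 128, the
 * threshold event fires and that target advances to 130 + 128; with
 * targets[MEM_CGROUP_TARGET_SOFTLIMIT] == 1024 the soft limit tree is left
 * alone until roughly 900 more page events have accumulated.
 */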

struct mem_cgroup_stat_cpu {
	long count[MEM_CGROUP_STAT_NSTATS];
	unsigned long events[MEM_CGROUP_EVENTS_NSTATS];
	unsigned long nr_page_events;
	unsigned long targets[MEM_CGROUP_NTARGETS];
};

struct mem_cgroup_reclaim_iter {
	/*
	 * last scanned hierarchy member. Valid only if last_dead_count
	 * matches memcg->dead_count of the hierarchy root group.
	 */
	struct mem_cgroup *last_visited;
	unsigned long last_dead_count;

	/* scan generation, increased every round-trip */
	unsigned int generation;
};

/*
 * per-zone information in memory controller.
 */
struct mem_cgroup_per_zone {
	struct lruvec		lruvec;
	unsigned long		lru_size[NR_LRU_LISTS];

	struct mem_cgroup_reclaim_iter reclaim_iter[DEF_PRIORITY + 1];

	struct rb_node		tree_node;	/* RB tree node */
	unsigned long long	usage_in_excess;/* Set to the value by which */
						/* the soft limit is exceeded*/
	bool			on_tree;
	struct mem_cgroup	*memcg;		/* Back pointer, we cannot */
						/* use container_of	   */
};

struct mem_cgroup_per_node {
	struct mem_cgroup_per_zone zoneinfo[MAX_NR_ZONES];
};

/*
 * Cgroups above their limits are maintained in a RB-Tree, independent of
 * their hierarchy representation
 */

struct mem_cgroup_tree_per_zone {
	struct rb_root rb_root;
	spinlock_t lock;
};

struct mem_cgroup_tree_per_node {
	struct mem_cgroup_tree_per_zone rb_tree_per_zone[MAX_NR_ZONES];
};

struct mem_cgroup_tree {
	struct mem_cgroup_tree_per_node *rb_tree_per_node[MAX_NUMNODES];
};

static struct mem_cgroup_tree soft_limit_tree __read_mostly;

struct mem_cgroup_threshold {
	struct eventfd_ctx *eventfd;
	u64 threshold;
};

/* For threshold */
struct mem_cgroup_threshold_ary {
	/* An array index points to threshold just below or equal to usage. */
	int current_threshold;
	/* Size of entries[] */
	unsigned int size;
	/* Array of thresholds */
	struct mem_cgroup_threshold entries[0];
};

struct mem_cgroup_thresholds {
	/* Primary thresholds array */
	struct mem_cgroup_threshold_ary *primary;
	/*
	 * Spare threshold array.
	 * This is needed to make mem_cgroup_unregister_event() "never fail".
	 * It must be able to store at least primary->size - 1 entries.
	 */
	struct mem_cgroup_threshold_ary *spare;
};
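
/*
 * Example of how the array above is meant to be read (values made up for
 * illustration): with entries[] holding thresholds of 4M, 8M and 16M and a
 * usage of 10M, current_threshold indexes the 8M entry -- the largest
 * threshold that is still below or equal to the usage.
 */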

/* for OOM */
struct mem_cgroup_eventfd_list {
	struct list_head list;
	struct eventfd_ctx *eventfd;
};

static void mem_cgroup_threshold(struct mem_cgroup *memcg);
static void mem_cgroup_oom_notify(struct mem_cgroup *memcg);

/*
 * The memory controller data structure. The memory controller controls both
 * page cache and RSS per cgroup. We would eventually like to provide
 * statistics based on the statistics developed by Rik van Riel for clock-pro,
 * to help the administrator determine what knobs to tune.
 *
 * TODO: Add a water mark for the memory controller. Reclaim will begin when
 * we hit the water mark. Maybe even add a low water mark, such that
 * no reclaim occurs from a cgroup at its low water mark; this is
 * a feature that will be implemented much later in the future.
 */
struct mem_cgroup {
	struct cgroup_subsys_state css;
	/*
	 * the counter to account for memory usage
	 */
	struct res_counter res;

	/* vmpressure notifications */
	struct vmpressure vmpressure;

	/*
	 * the counter to account for mem+swap usage.
	 */
	struct res_counter memsw;

	/*
	 * the counter to account for kernel memory usage.
	 */
	struct res_counter kmem;
	/*
	 * Should the accounting and control be hierarchical, per subtree?
	 */
	bool use_hierarchy;
	unsigned long kmem_account_flags; /* See KMEM_ACCOUNTED_*, below */

	bool		oom_lock;
	atomic_t	under_oom;
	atomic_t	oom_wakeups;

	int	swappiness;
	/* OOM-Killer disable */
	int		oom_kill_disable;

	/* set when res.limit == memsw.limit */
	bool		memsw_is_minimum;

	/* protect arrays of thresholds */
	struct mutex thresholds_lock;

	/* thresholds for memory usage. RCU-protected */
	struct mem_cgroup_thresholds thresholds;

	/* thresholds for mem+swap usage. RCU-protected */
	struct mem_cgroup_thresholds memsw_thresholds;

	/* For oom notifier event fd */
	struct list_head oom_notify;

	/*
	 * Should we move charges of a task when a task is moved into this
	 * mem_cgroup ? And what type of charges should we move ?
	 */
	unsigned long move_charge_at_immigrate;
	/*
	 * set > 0 if pages under this cgroup are moving to other cgroup.
	 */
	atomic_t	moving_account;
	/* taken only while moving_account > 0 */
	spinlock_t	move_lock;
	/*
	 * percpu counter.
	 */
	struct mem_cgroup_stat_cpu __percpu *stat;
	/*
	 * used when a cpu is offlined or other synchronizations
	 * See mem_cgroup_read_stat().
	 */
	struct mem_cgroup_stat_cpu nocpu_base;
	spinlock_t pcp_counter_lock;

	atomic_t	dead_count;
#if defined(CONFIG_MEMCG_KMEM) && defined(CONFIG_INET)
	struct cg_proto tcp_mem;
#endif
#if defined(CONFIG_MEMCG_KMEM)
	/* analogous to slab_common's slab_caches list. per-memcg */
	struct list_head memcg_slab_caches;
	/* Not a spinlock, we can take a lot of time walking the list */
	struct mutex slab_caches_mutex;
	/* Index in the kmem_cache->memcg_params->memcg_caches array */
	int kmemcg_id;
#endif

	int last_scanned_node;
#if MAX_NUMNODES > 1
	nodemask_t	scan_nodes;
	atomic_t	numainfo_events;
	atomic_t	numainfo_updating;
#endif

	struct mem_cgroup_per_node *nodeinfo[0];
	/* WARNING: nodeinfo must be the last member here */
};

static size_t memcg_size(void)
{
	return sizeof(struct mem_cgroup) +
		nr_node_ids * sizeof(struct mem_cgroup_per_node *);
}

/* internal only representation about the status of kmem accounting. */
enum {
	KMEM_ACCOUNTED_ACTIVE = 0, /* accounted by this cgroup itself */
	KMEM_ACCOUNTED_ACTIVATED, /* static key enabled. */
	KMEM_ACCOUNTED_DEAD, /* dead memcg with pending kmem charges */
};

/* We account when limit is on, but only after call sites are patched */
#define KMEM_ACCOUNTED_MASK \
		((1 << KMEM_ACCOUNTED_ACTIVE) | (1 << KMEM_ACCOUNTED_ACTIVATED))

#ifdef CONFIG_MEMCG_KMEM
static inline void memcg_kmem_set_active(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static bool memcg_kmem_is_active(struct mem_cgroup *memcg)
{
	return test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags);
}

static void memcg_kmem_set_activated(struct mem_cgroup *memcg)
{
	set_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

static void memcg_kmem_clear_activated(struct mem_cgroup *memcg)
{
	clear_bit(KMEM_ACCOUNTED_ACTIVATED, &memcg->kmem_account_flags);
}

static void memcg_kmem_mark_dead(struct mem_cgroup *memcg)
{
	/*
	 * Our caller must use css_get() first, because memcg_uncharge_kmem()
	 * will call css_put() if it sees the memcg is dead.
	 */
	smp_wmb();
	if (test_bit(KMEM_ACCOUNTED_ACTIVE, &memcg->kmem_account_flags))
		set_bit(KMEM_ACCOUNTED_DEAD, &memcg->kmem_account_flags);
}

static bool memcg_kmem_test_and_clear_dead(struct mem_cgroup *memcg)
{
	return test_and_clear_bit(KMEM_ACCOUNTED_DEAD,
				  &memcg->kmem_account_flags);
}
#endif

/* Stuff for move charges at task migration. */
/*
 * Types of charges to be moved. "move_charge_at_immigrate" and
 * "immigrate_flags" are treated as a left-shifted bitmap of these types.
 */
enum move_type {
	MOVE_CHARGE_TYPE_ANON,	/* private anonymous page and swap of it */
	MOVE_CHARGE_TYPE_FILE,	/* file page(including tmpfs) and swap of it */
	NR_MOVE_TYPE,
};

/* "mc" and its members are protected by cgroup_mutex */
static struct move_charge_struct {
	spinlock_t	  lock; /* for from, to */
	struct mem_cgroup *from;
	struct mem_cgroup *to;
	unsigned long immigrate_flags;
	unsigned long precharge;
	unsigned long moved_charge;
	unsigned long moved_swap;
	struct task_struct *moving_task;	/* a task moving charges */
	wait_queue_head_t waitq;		/* a waitq for other context */
} mc = {
	.lock = __SPIN_LOCK_UNLOCKED(mc.lock),
	.waitq = __WAIT_QUEUE_HEAD_INITIALIZER(mc.waitq),
};

static bool move_anon(void)
{
	return test_bit(MOVE_CHARGE_TYPE_ANON, &mc.immigrate_flags);
}

static bool move_file(void)
{
	return test_bit(MOVE_CHARGE_TYPE_FILE, &mc.immigrate_flags);
}
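
/*
 * For reference, behaviour already described by the enum above (the echo
 * line is only a hypothetical example): writing a bitmask such as
 *
 *	echo 3 > memory.move_charge_at_immigrate
 *
 * sets MOVE_CHARGE_TYPE_ANON (bit 0) and MOVE_CHARGE_TYPE_FILE (bit 1),
 * so both move_anon() and move_file() return true while a task is being
 * moved into the cgroup.
 */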
434 */ 435 #define MEM_CGROUP_MAX_RECLAIM_LOOPS 100 436 #define MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS 2 437 438 enum charge_type { 439 MEM_CGROUP_CHARGE_TYPE_CACHE = 0, 440 MEM_CGROUP_CHARGE_TYPE_ANON, 441 MEM_CGROUP_CHARGE_TYPE_SWAPOUT, /* for accounting swapcache */ 442 MEM_CGROUP_CHARGE_TYPE_DROP, /* a page was unused swap cache */ 443 NR_CHARGE_TYPE, 444 }; 445 446 /* for encoding cft->private value on file */ 447 enum res_type { 448 _MEM, 449 _MEMSWAP, 450 _OOM_TYPE, 451 _KMEM, 452 }; 453 454 #define MEMFILE_PRIVATE(x, val) ((x) << 16 | (val)) 455 #define MEMFILE_TYPE(val) ((val) >> 16 & 0xffff) 456 #define MEMFILE_ATTR(val) ((val) & 0xffff) 457 /* Used for OOM nofiier */ 458 #define OOM_CONTROL (0) 459 460 /* 461 * Reclaim flags for mem_cgroup_hierarchical_reclaim 462 */ 463 #define MEM_CGROUP_RECLAIM_NOSWAP_BIT 0x0 464 #define MEM_CGROUP_RECLAIM_NOSWAP (1 << MEM_CGROUP_RECLAIM_NOSWAP_BIT) 465 #define MEM_CGROUP_RECLAIM_SHRINK_BIT 0x1 466 #define MEM_CGROUP_RECLAIM_SHRINK (1 << MEM_CGROUP_RECLAIM_SHRINK_BIT) 467 468 /* 469 * The memcg_create_mutex will be held whenever a new cgroup is created. 470 * As a consequence, any change that needs to protect against new child cgroups 471 * appearing has to hold it as well. 472 */ 473 static DEFINE_MUTEX(memcg_create_mutex); 474 475 struct mem_cgroup *mem_cgroup_from_css(struct cgroup_subsys_state *s) 476 { 477 return s ? container_of(s, struct mem_cgroup, css) : NULL; 478 } 479 480 /* Some nice accessors for the vmpressure. */ 481 struct vmpressure *memcg_to_vmpressure(struct mem_cgroup *memcg) 482 { 483 if (!memcg) 484 memcg = root_mem_cgroup; 485 return &memcg->vmpressure; 486 } 487 488 struct cgroup_subsys_state *vmpressure_to_css(struct vmpressure *vmpr) 489 { 490 return &container_of(vmpr, struct mem_cgroup, vmpressure)->css; 491 } 492 493 struct vmpressure *css_to_vmpressure(struct cgroup_subsys_state *css) 494 { 495 return &mem_cgroup_from_css(css)->vmpressure; 496 } 497 498 static inline bool mem_cgroup_is_root(struct mem_cgroup *memcg) 499 { 500 return (memcg == root_mem_cgroup); 501 } 502 503 /* 504 * We restrict the id in the range of [1, 65535], so it can fit into 505 * an unsigned short. 506 */ 507 #define MEM_CGROUP_ID_MAX USHRT_MAX 508 509 static inline unsigned short mem_cgroup_id(struct mem_cgroup *memcg) 510 { 511 /* 512 * The ID of the root cgroup is 0, but memcg treat 0 as an 513 * invalid ID, so we return (cgroup_id + 1). 514 */ 515 return memcg->css.cgroup->id + 1; 516 } 517 518 static inline struct mem_cgroup *mem_cgroup_from_id(unsigned short id) 519 { 520 struct cgroup_subsys_state *css; 521 522 css = css_from_id(id - 1, &mem_cgroup_subsys); 523 return mem_cgroup_from_css(css); 524 } 525 526 /* Writing them here to avoid exposing memcg's inner layout */ 527 #if defined(CONFIG_INET) && defined(CONFIG_MEMCG_KMEM) 528 529 void sock_update_memcg(struct sock *sk) 530 { 531 if (mem_cgroup_sockets_enabled) { 532 struct mem_cgroup *memcg; 533 struct cg_proto *cg_proto; 534 535 BUG_ON(!sk->sk_prot->proto_cgroup); 536 537 /* Socket cloning can throw us here with sk_cgrp already 538 * filled. It won't however, necessarily happen from 539 * process context. So the test for root memcg given 540 * the current task's memcg won't help us in this case. 541 * 542 * Respecting the original socket's memcg is a better 543 * decision in this case. 
544 */ 545 if (sk->sk_cgrp) { 546 BUG_ON(mem_cgroup_is_root(sk->sk_cgrp->memcg)); 547 css_get(&sk->sk_cgrp->memcg->css); 548 return; 549 } 550 551 rcu_read_lock(); 552 memcg = mem_cgroup_from_task(current); 553 cg_proto = sk->sk_prot->proto_cgroup(memcg); 554 if (!mem_cgroup_is_root(memcg) && 555 memcg_proto_active(cg_proto) && css_tryget(&memcg->css)) { 556 sk->sk_cgrp = cg_proto; 557 } 558 rcu_read_unlock(); 559 } 560 } 561 EXPORT_SYMBOL(sock_update_memcg); 562 563 void sock_release_memcg(struct sock *sk) 564 { 565 if (mem_cgroup_sockets_enabled && sk->sk_cgrp) { 566 struct mem_cgroup *memcg; 567 WARN_ON(!sk->sk_cgrp->memcg); 568 memcg = sk->sk_cgrp->memcg; 569 css_put(&sk->sk_cgrp->memcg->css); 570 } 571 } 572 573 struct cg_proto *tcp_proto_cgroup(struct mem_cgroup *memcg) 574 { 575 if (!memcg || mem_cgroup_is_root(memcg)) 576 return NULL; 577 578 return &memcg->tcp_mem; 579 } 580 EXPORT_SYMBOL(tcp_proto_cgroup); 581 582 static void disarm_sock_keys(struct mem_cgroup *memcg) 583 { 584 if (!memcg_proto_activated(&memcg->tcp_mem)) 585 return; 586 static_key_slow_dec(&memcg_socket_limit_enabled); 587 } 588 #else 589 static void disarm_sock_keys(struct mem_cgroup *memcg) 590 { 591 } 592 #endif 593 594 #ifdef CONFIG_MEMCG_KMEM 595 /* 596 * This will be the memcg's index in each cache's ->memcg_params->memcg_caches. 597 * The main reason for not using cgroup id for this: 598 * this works better in sparse environments, where we have a lot of memcgs, 599 * but only a few kmem-limited. Or also, if we have, for instance, 200 600 * memcgs, and none but the 200th is kmem-limited, we'd have to have a 601 * 200 entry array for that. 602 * 603 * The current size of the caches array is stored in 604 * memcg_limited_groups_array_size. It will double each time we have to 605 * increase it. 606 */ 607 static DEFINE_IDA(kmem_limited_groups); 608 int memcg_limited_groups_array_size; 609 610 /* 611 * MIN_SIZE is different than 1, because we would like to avoid going through 612 * the alloc/free process all the time. In a small machine, 4 kmem-limited 613 * cgroups is a reasonable guess. In the future, it could be a parameter or 614 * tunable, but that is strictly not necessary. 615 * 616 * MAX_SIZE should be as large as the number of cgrp_ids. Ideally, we could get 617 * this constant directly from cgroup, but it is understandable that this is 618 * better kept as an internal representation in cgroup.c. In any case, the 619 * cgrp_id space is not getting any smaller, and we don't have to necessarily 620 * increase ours as well if it increases. 621 */ 622 #define MEMCG_CACHES_MIN_SIZE 4 623 #define MEMCG_CACHES_MAX_SIZE MEM_CGROUP_ID_MAX 624 625 /* 626 * A lot of the calls to the cache allocation functions are expected to be 627 * inlined by the compiler. 

/*
 * A lot of the calls to the cache allocation functions are expected to be
 * inlined by the compiler. Since the calls to memcg_kmem_get_cache are
 * conditional to this static branch, we'll have to allow modules that do
 * kmem_cache_alloc and the like to see this symbol as well.
 */
struct static_key memcg_kmem_enabled_key;
EXPORT_SYMBOL(memcg_kmem_enabled_key);

static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
	if (memcg_kmem_is_active(memcg)) {
		static_key_slow_dec(&memcg_kmem_enabled_key);
		ida_simple_remove(&kmem_limited_groups, memcg->kmemcg_id);
	}
	/*
	 * This check can't live in kmem destruction function,
	 * since the charges will outlive the cgroup
	 */
	WARN_ON(res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0);
}
#else
static void disarm_kmem_keys(struct mem_cgroup *memcg)
{
}
#endif /* CONFIG_MEMCG_KMEM */

static void disarm_static_keys(struct mem_cgroup *memcg)
{
	disarm_sock_keys(memcg);
	disarm_kmem_keys(memcg);
}

static void drain_all_stock_async(struct mem_cgroup *memcg);

static struct mem_cgroup_per_zone *
mem_cgroup_zoneinfo(struct mem_cgroup *memcg, int nid, int zid)
{
	VM_BUG_ON((unsigned)nid >= nr_node_ids);
	return &memcg->nodeinfo[nid]->zoneinfo[zid];
}

struct cgroup_subsys_state *mem_cgroup_css(struct mem_cgroup *memcg)
{
	return &memcg->css;
}

static struct mem_cgroup_per_zone *
page_cgroup_zoneinfo(struct mem_cgroup *memcg, struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return mem_cgroup_zoneinfo(memcg, nid, zid);
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_node_zone(int nid, int zid)
{
	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static struct mem_cgroup_tree_per_zone *
soft_limit_tree_from_page(struct page *page)
{
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);

	return &soft_limit_tree.rb_tree_per_node[nid]->rb_tree_per_zone[zid];
}

static void
__mem_cgroup_insert_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz,
				unsigned long long new_usage_in_excess)
{
	struct rb_node **p = &mctz->rb_root.rb_node;
	struct rb_node *parent = NULL;
	struct mem_cgroup_per_zone *mz_node;

	if (mz->on_tree)
		return;

	mz->usage_in_excess = new_usage_in_excess;
	if (!mz->usage_in_excess)
		return;
	while (*p) {
		parent = *p;
		mz_node = rb_entry(parent, struct mem_cgroup_per_zone,
					tree_node);
		if (mz->usage_in_excess < mz_node->usage_in_excess)
			p = &(*p)->rb_left;
		/*
		 * We can't avoid mem cgroups that are over their soft
		 * limit by the same amount
		 */
		else if (mz->usage_in_excess >= mz_node->usage_in_excess)
			p = &(*p)->rb_right;
	}
	rb_link_node(&mz->tree_node, parent, p);
	rb_insert_color(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = true;
}

static void
__mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	if (!mz->on_tree)
		return;
	rb_erase(&mz->tree_node, &mctz->rb_root);
	mz->on_tree = false;
}

static void
mem_cgroup_remove_exceeded(struct mem_cgroup *memcg,
				struct mem_cgroup_per_zone *mz,
				struct mem_cgroup_tree_per_zone *mctz)
{
	spin_lock(&mctz->lock);
	__mem_cgroup_remove_exceeded(memcg, mz, mctz);
	spin_unlock(&mctz->lock);
}


static void mem_cgroup_update_tree(struct mem_cgroup *memcg, struct page *page)
{
	unsigned long long excess;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;
	int nid = page_to_nid(page);
	int zid = page_zonenum(page);
	mctz = soft_limit_tree_from_page(page);

	/*
	 * Necessary to update all ancestors when hierarchy is used,
	 * because their event counter is not touched.
	 */
	for (; memcg; memcg = parent_mem_cgroup(memcg)) {
		mz = mem_cgroup_zoneinfo(memcg, nid, zid);
		excess = res_counter_soft_limit_excess(&memcg->res);
		/*
		 * We have to update the tree if mz is on RB-tree or
		 * mem is over its softlimit.
		 */
		if (excess || mz->on_tree) {
			spin_lock(&mctz->lock);
			/* if on-tree, remove it */
			if (mz->on_tree)
				__mem_cgroup_remove_exceeded(memcg, mz, mctz);
			/*
			 * Insert again. mz->usage_in_excess will be updated.
			 * If excess is 0, no tree ops.
			 */
			__mem_cgroup_insert_exceeded(memcg, mz, mctz, excess);
			spin_unlock(&mctz->lock);
		}
	}
}

static void mem_cgroup_remove_from_trees(struct mem_cgroup *memcg)
{
	int node, zone;
	struct mem_cgroup_per_zone *mz;
	struct mem_cgroup_tree_per_zone *mctz;

	for_each_node(node) {
		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
			mz = mem_cgroup_zoneinfo(memcg, node, zone);
			mctz = soft_limit_tree_node_zone(node, zone);
			mem_cgroup_remove_exceeded(memcg, mz, mctz);
		}
	}
}

static struct mem_cgroup_per_zone *
__mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct rb_node *rightmost = NULL;
	struct mem_cgroup_per_zone *mz;

retry:
	mz = NULL;
	rightmost = rb_last(&mctz->rb_root);
	if (!rightmost)
		goto done;		/* Nothing to reclaim from */

	mz = rb_entry(rightmost, struct mem_cgroup_per_zone, tree_node);
	/*
	 * Remove the node now but someone else can add it back,
	 * we will add it back at the end of reclaim to its correct
	 * position in the tree.
	 */
	__mem_cgroup_remove_exceeded(mz->memcg, mz, mctz);
	if (!res_counter_soft_limit_excess(&mz->memcg->res) ||
		!css_tryget(&mz->memcg->css))
		goto retry;
done:
	return mz;
}

static struct mem_cgroup_per_zone *
mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_zone *mctz)
{
	struct mem_cgroup_per_zone *mz;

	spin_lock(&mctz->lock);
	mz = __mem_cgroup_largest_soft_limit_node(mctz);
	spin_unlock(&mctz->lock);
	return mz;
}

/*
 * Implementation Note: reading percpu statistics for memcg.
 *
 * Both vmstat[] and percpu_counter use thresholds and do periodic
 * synchronization to implement a "quick" read. There is a trade-off between
 * reading cost and precision of the value. Then, we may have a chance to
 * implement a periodic synchronization of the counter in memcg's counter.
 *
 * But this _read() function is used for the user interface now. The user
 * accounts memory usage by memory cgroup and _always_ requires an exact value
 * because he accounts memory. Even if we provide a quick-and-fuzzy read, we
 * always have to visit all online cpus and make the sum. So, for now,
 * unnecessary synchronization is not implemented. (just implemented for
 * cpu hotplug)
 *
 * If there are kernel internal actions which can make use of some not-exact
 * value, and reading all cpu values can be a performance bottleneck in some
 * common workload, thresholds and synchronization as in vmstat[] should be
 * implemented.
 */
static long mem_cgroup_read_stat(struct mem_cgroup *memcg,
				 enum mem_cgroup_stat_index idx)
{
	long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->count[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.count[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg,
					 bool charge)
{
	int val = (charge) ? 1 : -1;
	this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_SWAP], val);
}

static unsigned long mem_cgroup_read_events(struct mem_cgroup *memcg,
					    enum mem_cgroup_events_index idx)
{
	unsigned long val = 0;
	int cpu;

	get_online_cpus();
	for_each_online_cpu(cpu)
		val += per_cpu(memcg->stat->events[idx], cpu);
#ifdef CONFIG_HOTPLUG_CPU
	spin_lock(&memcg->pcp_counter_lock);
	val += memcg->nocpu_base.events[idx];
	spin_unlock(&memcg->pcp_counter_lock);
#endif
	put_online_cpus();
	return val;
}

static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg,
					 struct page *page,
					 bool anon, int nr_pages)
{
	preempt_disable();

	/*
	 * Here, RSS means 'mapped anon' and anon's SwapCache. Shmem/tmpfs is
	 * counted as CACHE even if it's on ANON LRU.
	 */
	if (anon)
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS],
				nr_pages);
	else
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_CACHE],
				nr_pages);

	if (PageTransHuge(page))
		__this_cpu_add(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
				nr_pages);

	/* pagein of a big page is an event. So, ignore page size */
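	/*
	 * e.g. a 2MB THP pagein on x86 (nr_pages == 512) counts as a single
	 * PGPGIN event, while nr_page_events below still advances by 512
	 * (illustrative numbers, not an extra code path).
	 */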
	if (nr_pages > 0)
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGIN]);
	else {
		__this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGPGOUT]);
		nr_pages = -nr_pages; /* for event */
	}

	__this_cpu_add(memcg->stat->nr_page_events, nr_pages);

	preempt_enable();
}

unsigned long
mem_cgroup_get_lru_size(struct lruvec *lruvec, enum lru_list lru)
{
	struct mem_cgroup_per_zone *mz;

	mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec);
	return mz->lru_size[lru];
}

static unsigned long
mem_cgroup_zone_nr_lru_pages(struct mem_cgroup *memcg, int nid, int zid,
			unsigned int lru_mask)
{
	struct mem_cgroup_per_zone *mz;
	enum lru_list lru;
	unsigned long ret = 0;

	mz = mem_cgroup_zoneinfo(memcg, nid, zid);

	for_each_lru(lru) {
		if (BIT(lru) & lru_mask)
			ret += mz->lru_size[lru];
	}
	return ret;
}

static unsigned long
mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg,
			int nid, unsigned int lru_mask)
{
	u64 total = 0;
	int zid;

	for (zid = 0; zid < MAX_NR_ZONES; zid++)
		total += mem_cgroup_zone_nr_lru_pages(memcg,
						nid, zid, lru_mask);

	return total;
}

static unsigned long mem_cgroup_nr_lru_pages(struct mem_cgroup *memcg,
			unsigned int lru_mask)
{
	int nid;
	u64 total = 0;

	for_each_node_state(nid, N_MEMORY)
		total += mem_cgroup_node_nr_lru_pages(memcg, nid, lru_mask);
	return total;
}

static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg,
				       enum mem_cgroup_events_target target)
{
	unsigned long val, next;

	val = __this_cpu_read(memcg->stat->nr_page_events);
	next = __this_cpu_read(memcg->stat->targets[target]);
	/* from time_after() in jiffies.h */
	if ((long)next - (long)val < 0) {
		switch (target) {
		case MEM_CGROUP_TARGET_THRESH:
			next = val + THRESHOLDS_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_SOFTLIMIT:
			next = val + SOFTLIMIT_EVENTS_TARGET;
			break;
		case MEM_CGROUP_TARGET_NUMAINFO:
			next = val + NUMAINFO_EVENTS_TARGET;
			break;
		default:
			break;
		}
		__this_cpu_write(memcg->stat->targets[target], next);
		return true;
	}
	return false;
}

/*
 * Check events in order.
 */
static void memcg_check_events(struct mem_cgroup *memcg, struct page *page)
{
	preempt_disable();
	/* threshold event is triggered in finer grain than soft limit */
	if (unlikely(mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_THRESH))) {
		bool do_softlimit;
		bool do_numainfo __maybe_unused;

		do_softlimit = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_SOFTLIMIT);
#if MAX_NUMNODES > 1
		do_numainfo = mem_cgroup_event_ratelimit(memcg,
						MEM_CGROUP_TARGET_NUMAINFO);
#endif
		preempt_enable();

		mem_cgroup_threshold(memcg);
		if (unlikely(do_softlimit))
			mem_cgroup_update_tree(memcg, page);
#if MAX_NUMNODES > 1
		if (unlikely(do_numainfo))
			atomic_inc(&memcg->numainfo_events);
#endif
	} else
		preempt_enable();
}
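
/*
 * Rough sketch of the existing flow (not a new interface): the charge and
 * uncharge paths call mem_cgroup_charge_statistics() and then
 * memcg_check_events(); once enough page events have accumulated, the
 * latter fans out to mem_cgroup_threshold(), mem_cgroup_update_tree() and
 * the NUMA info refresh.
 */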
1052 */ 1053 if (unlikely(!p)) 1054 return NULL; 1055 1056 return mem_cgroup_from_css(task_css(p, mem_cgroup_subsys_id)); 1057 } 1058 1059 struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) 1060 { 1061 struct mem_cgroup *memcg = NULL; 1062 1063 if (!mm) 1064 return NULL; 1065 /* 1066 * Because we have no locks, mm->owner's may be being moved to other 1067 * cgroup. We use css_tryget() here even if this looks 1068 * pessimistic (rather than adding locks here). 1069 */ 1070 rcu_read_lock(); 1071 do { 1072 memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); 1073 if (unlikely(!memcg)) 1074 break; 1075 } while (!css_tryget(&memcg->css)); 1076 rcu_read_unlock(); 1077 return memcg; 1078 } 1079 1080 /* 1081 * Returns a next (in a pre-order walk) alive memcg (with elevated css 1082 * ref. count) or NULL if the whole root's subtree has been visited. 1083 * 1084 * helper function to be used by mem_cgroup_iter 1085 */ 1086 static struct mem_cgroup *__mem_cgroup_iter_next(struct mem_cgroup *root, 1087 struct mem_cgroup *last_visited) 1088 { 1089 struct cgroup_subsys_state *prev_css, *next_css; 1090 1091 prev_css = last_visited ? &last_visited->css : NULL; 1092 skip_node: 1093 next_css = css_next_descendant_pre(prev_css, &root->css); 1094 1095 /* 1096 * Even if we found a group we have to make sure it is 1097 * alive. css && !memcg means that the groups should be 1098 * skipped and we should continue the tree walk. 1099 * last_visited css is safe to use because it is 1100 * protected by css_get and the tree walk is rcu safe. 1101 * 1102 * We do not take a reference on the root of the tree walk 1103 * because we might race with the root removal when it would 1104 * be the only node in the iterated hierarchy and mem_cgroup_iter 1105 * would end up in an endless loop because it expects that at 1106 * least one valid node will be returned. Root cannot disappear 1107 * because caller of the iterator should hold it already so 1108 * skipping css reference should be safe. 1109 */ 1110 if (next_css) { 1111 if ((next_css == &root->css) || 1112 ((next_css->flags & CSS_ONLINE) && css_tryget(next_css))) 1113 return mem_cgroup_from_css(next_css); 1114 1115 prev_css = next_css; 1116 goto skip_node; 1117 } 1118 1119 return NULL; 1120 } 1121 1122 static void mem_cgroup_iter_invalidate(struct mem_cgroup *root) 1123 { 1124 /* 1125 * When a group in the hierarchy below root is destroyed, the 1126 * hierarchy iterator can no longer be trusted since it might 1127 * have pointed to the destroyed group. Invalidate it. 1128 */ 1129 atomic_inc(&root->dead_count); 1130 } 1131 1132 static struct mem_cgroup * 1133 mem_cgroup_iter_load(struct mem_cgroup_reclaim_iter *iter, 1134 struct mem_cgroup *root, 1135 int *sequence) 1136 { 1137 struct mem_cgroup *position = NULL; 1138 /* 1139 * A cgroup destruction happens in two stages: offlining and 1140 * release. They are separated by a RCU grace period. 1141 * 1142 * If the iterator is valid, we may still race with an 1143 * offlining. The RCU lock ensures the object won't be 1144 * released, tryget will fail if we lost the race. 1145 */ 1146 *sequence = atomic_read(&root->dead_count); 1147 if (iter->last_dead_count == *sequence) { 1148 smp_rmb(); 1149 position = iter->last_visited; 1150 1151 /* 1152 * We cannot take a reference to root because we might race 1153 * with root removal and returning NULL would end up in 1154 * an endless loop on the iterator user level when root 1155 * would be returned all the time. 
1156 */ 1157 if (position && position != root && 1158 !css_tryget(&position->css)) 1159 position = NULL; 1160 } 1161 return position; 1162 } 1163 1164 static void mem_cgroup_iter_update(struct mem_cgroup_reclaim_iter *iter, 1165 struct mem_cgroup *last_visited, 1166 struct mem_cgroup *new_position, 1167 struct mem_cgroup *root, 1168 int sequence) 1169 { 1170 /* root reference counting symmetric to mem_cgroup_iter_load */ 1171 if (last_visited && last_visited != root) 1172 css_put(&last_visited->css); 1173 /* 1174 * We store the sequence count from the time @last_visited was 1175 * loaded successfully instead of rereading it here so that we 1176 * don't lose destruction events in between. We could have 1177 * raced with the destruction of @new_position after all. 1178 */ 1179 iter->last_visited = new_position; 1180 smp_wmb(); 1181 iter->last_dead_count = sequence; 1182 } 1183 1184 /** 1185 * mem_cgroup_iter - iterate over memory cgroup hierarchy 1186 * @root: hierarchy root 1187 * @prev: previously returned memcg, NULL on first invocation 1188 * @reclaim: cookie for shared reclaim walks, NULL for full walks 1189 * 1190 * Returns references to children of the hierarchy below @root, or 1191 * @root itself, or %NULL after a full round-trip. 1192 * 1193 * Caller must pass the return value in @prev on subsequent 1194 * invocations for reference counting, or use mem_cgroup_iter_break() 1195 * to cancel a hierarchy walk before the round-trip is complete. 1196 * 1197 * Reclaimers can specify a zone and a priority level in @reclaim to 1198 * divide up the memcgs in the hierarchy among all concurrent 1199 * reclaimers operating on the same zone and priority. 1200 */ 1201 struct mem_cgroup *mem_cgroup_iter(struct mem_cgroup *root, 1202 struct mem_cgroup *prev, 1203 struct mem_cgroup_reclaim_cookie *reclaim) 1204 { 1205 struct mem_cgroup *memcg = NULL; 1206 struct mem_cgroup *last_visited = NULL; 1207 1208 if (mem_cgroup_disabled()) 1209 return NULL; 1210 1211 if (!root) 1212 root = root_mem_cgroup; 1213 1214 if (prev && !reclaim) 1215 last_visited = prev; 1216 1217 if (!root->use_hierarchy && root != root_mem_cgroup) { 1218 if (prev) 1219 goto out_css_put; 1220 return root; 1221 } 1222 1223 rcu_read_lock(); 1224 while (!memcg) { 1225 struct mem_cgroup_reclaim_iter *uninitialized_var(iter); 1226 int uninitialized_var(seq); 1227 1228 if (reclaim) { 1229 int nid = zone_to_nid(reclaim->zone); 1230 int zid = zone_idx(reclaim->zone); 1231 struct mem_cgroup_per_zone *mz; 1232 1233 mz = mem_cgroup_zoneinfo(root, nid, zid); 1234 iter = &mz->reclaim_iter[reclaim->priority]; 1235 if (prev && reclaim->generation != iter->generation) { 1236 iter->last_visited = NULL; 1237 goto out_unlock; 1238 } 1239 1240 last_visited = mem_cgroup_iter_load(iter, root, &seq); 1241 } 1242 1243 memcg = __mem_cgroup_iter_next(root, last_visited); 1244 1245 if (reclaim) { 1246 mem_cgroup_iter_update(iter, last_visited, memcg, root, 1247 seq); 1248 1249 if (!memcg) 1250 iter->generation++; 1251 else if (!prev && memcg) 1252 reclaim->generation = iter->generation; 1253 } 1254 1255 if (prev && !memcg) 1256 goto out_unlock; 1257 } 1258 out_unlock: 1259 rcu_read_unlock(); 1260 out_css_put: 1261 if (prev && prev != root) 1262 css_put(&prev->css); 1263 1264 return memcg; 1265 } 1266 1267 /** 1268 * mem_cgroup_iter_break - abort a hierarchy walk prematurely 1269 * @root: hierarchy root 1270 * @prev: last visited hierarchy member as returned by mem_cgroup_iter() 1271 */ 1272 void mem_cgroup_iter_break(struct mem_cgroup *root, 1273 struct 

/**
 * mem_cgroup_iter_break - abort a hierarchy walk prematurely
 * @root: hierarchy root
 * @prev: last visited hierarchy member as returned by mem_cgroup_iter()
 */
void mem_cgroup_iter_break(struct mem_cgroup *root,
			   struct mem_cgroup *prev)
{
	if (!root)
		root = root_mem_cgroup;
	if (prev && prev != root)
		css_put(&prev->css);
}

/*
 * Iteration constructs for visiting all cgroups (under a tree). If
 * loops are exited prematurely (break), mem_cgroup_iter_break() must
 * be used for reference counting.
 */
#define for_each_mem_cgroup_tree(iter, root)		\
	for (iter = mem_cgroup_iter(root, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(root, iter, NULL))

#define for_each_mem_cgroup(iter)			\
	for (iter = mem_cgroup_iter(NULL, NULL, NULL);	\
	     iter != NULL;				\
	     iter = mem_cgroup_iter(NULL, iter, NULL))

void __mem_cgroup_count_vm_event(struct mm_struct *mm, enum vm_event_item idx)
{
	struct mem_cgroup *memcg;

	rcu_read_lock();
	memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
	if (unlikely(!memcg))
		goto out;

	switch (idx) {
	case PGFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGFAULT]);
		break;
	case PGMAJFAULT:
		this_cpu_inc(memcg->stat->events[MEM_CGROUP_EVENTS_PGMAJFAULT]);
		break;
	default:
		BUG();
	}
out:
	rcu_read_unlock();
}
EXPORT_SYMBOL(__mem_cgroup_count_vm_event);

/**
 * mem_cgroup_zone_lruvec - get the lru list vector for a zone and memcg
 * @zone: zone of the wanted lruvec
 * @memcg: memcg of the wanted lruvec
 *
 * Returns the lru list vector holding pages for the given @zone and
 * @memcg. This can be the global zone lruvec, if the memory controller
 * is disabled.
 */
struct lruvec *mem_cgroup_zone_lruvec(struct zone *zone,
				      struct mem_cgroup *memcg)
{
	struct mem_cgroup_per_zone *mz;
	struct lruvec *lruvec;

	if (mem_cgroup_disabled()) {
		lruvec = &zone->lruvec;
		goto out;
	}

	mz = mem_cgroup_zoneinfo(memcg, zone_to_nid(zone), zone_idx(zone));
	lruvec = &mz->lruvec;
out:
	/*
	 * Since a node can be onlined after the mem_cgroup was created,
	 * we have to be prepared to initialize lruvec->zone here;
	 * and if offlined then reonlined, we need to reinitialize it.
	 */
	if (unlikely(lruvec->zone != zone))
		lruvec->zone = zone;
	return lruvec;
}

/*
 * Following LRU functions are allowed to be used without PCG_LOCK.
 * Operations are called by routine of global LRU independently from memcg.
 * What we have to take care of here is validity of pc->mem_cgroup.
 *
 * Changes to pc->mem_cgroup happen when
 * 1. charge
 * 2. moving account
 * In typical case, "charge" is done before add-to-lru. Exception is SwapCache.
 * It is added to LRU before charge.
 * If PCG_USED bit is not set, page_cgroup is not added to this private LRU.
 * When moving account, the page is not on LRU. It's isolated.
1365 */ 1366 1367 /** 1368 * mem_cgroup_page_lruvec - return lruvec for adding an lru page 1369 * @page: the page 1370 * @zone: zone of the page 1371 */ 1372 struct lruvec *mem_cgroup_page_lruvec(struct page *page, struct zone *zone) 1373 { 1374 struct mem_cgroup_per_zone *mz; 1375 struct mem_cgroup *memcg; 1376 struct page_cgroup *pc; 1377 struct lruvec *lruvec; 1378 1379 if (mem_cgroup_disabled()) { 1380 lruvec = &zone->lruvec; 1381 goto out; 1382 } 1383 1384 pc = lookup_page_cgroup(page); 1385 memcg = pc->mem_cgroup; 1386 1387 /* 1388 * Surreptitiously switch any uncharged offlist page to root: 1389 * an uncharged page off lru does nothing to secure 1390 * its former mem_cgroup from sudden removal. 1391 * 1392 * Our caller holds lru_lock, and PageCgroupUsed is updated 1393 * under page_cgroup lock: between them, they make all uses 1394 * of pc->mem_cgroup safe. 1395 */ 1396 if (!PageLRU(page) && !PageCgroupUsed(pc) && memcg != root_mem_cgroup) 1397 pc->mem_cgroup = memcg = root_mem_cgroup; 1398 1399 mz = page_cgroup_zoneinfo(memcg, page); 1400 lruvec = &mz->lruvec; 1401 out: 1402 /* 1403 * Since a node can be onlined after the mem_cgroup was created, 1404 * we have to be prepared to initialize lruvec->zone here; 1405 * and if offlined then reonlined, we need to reinitialize it. 1406 */ 1407 if (unlikely(lruvec->zone != zone)) 1408 lruvec->zone = zone; 1409 return lruvec; 1410 } 1411 1412 /** 1413 * mem_cgroup_update_lru_size - account for adding or removing an lru page 1414 * @lruvec: mem_cgroup per zone lru vector 1415 * @lru: index of lru list the page is sitting on 1416 * @nr_pages: positive when adding or negative when removing 1417 * 1418 * This function must be called when a page is added to or removed from an 1419 * lru list. 1420 */ 1421 void mem_cgroup_update_lru_size(struct lruvec *lruvec, enum lru_list lru, 1422 int nr_pages) 1423 { 1424 struct mem_cgroup_per_zone *mz; 1425 unsigned long *lru_size; 1426 1427 if (mem_cgroup_disabled()) 1428 return; 1429 1430 mz = container_of(lruvec, struct mem_cgroup_per_zone, lruvec); 1431 lru_size = mz->lru_size + lru; 1432 *lru_size += nr_pages; 1433 VM_BUG_ON((long)(*lru_size) < 0); 1434 } 1435 1436 /* 1437 * Checks whether given mem is same or in the root_mem_cgroup's 1438 * hierarchy subtree 1439 */ 1440 bool __mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1441 struct mem_cgroup *memcg) 1442 { 1443 if (root_memcg == memcg) 1444 return true; 1445 if (!root_memcg->use_hierarchy || !memcg) 1446 return false; 1447 return cgroup_is_descendant(memcg->css.cgroup, root_memcg->css.cgroup); 1448 } 1449 1450 static bool mem_cgroup_same_or_subtree(const struct mem_cgroup *root_memcg, 1451 struct mem_cgroup *memcg) 1452 { 1453 bool ret; 1454 1455 rcu_read_lock(); 1456 ret = __mem_cgroup_same_or_subtree(root_memcg, memcg); 1457 rcu_read_unlock(); 1458 return ret; 1459 } 1460 1461 bool task_in_mem_cgroup(struct task_struct *task, 1462 const struct mem_cgroup *memcg) 1463 { 1464 struct mem_cgroup *curr = NULL; 1465 struct task_struct *p; 1466 bool ret; 1467 1468 p = find_lock_task_mm(task); 1469 if (p) { 1470 curr = try_get_mem_cgroup_from_mm(p->mm); 1471 task_unlock(p); 1472 } else { 1473 /* 1474 * All threads may have already detached their mm's, but the oom 1475 * killer still needs to detect if they have already been oom 1476 * killed to prevent needlessly killing additional tasks. 
1477 */ 1478 rcu_read_lock(); 1479 curr = mem_cgroup_from_task(task); 1480 if (curr) 1481 css_get(&curr->css); 1482 rcu_read_unlock(); 1483 } 1484 if (!curr) 1485 return false; 1486 /* 1487 * We should check use_hierarchy of "memcg" not "curr". Because checking 1488 * use_hierarchy of "curr" here make this function true if hierarchy is 1489 * enabled in "curr" and "curr" is a child of "memcg" in *cgroup* 1490 * hierarchy(even if use_hierarchy is disabled in "memcg"). 1491 */ 1492 ret = mem_cgroup_same_or_subtree(memcg, curr); 1493 css_put(&curr->css); 1494 return ret; 1495 } 1496 1497 int mem_cgroup_inactive_anon_is_low(struct lruvec *lruvec) 1498 { 1499 unsigned long inactive_ratio; 1500 unsigned long inactive; 1501 unsigned long active; 1502 unsigned long gb; 1503 1504 inactive = mem_cgroup_get_lru_size(lruvec, LRU_INACTIVE_ANON); 1505 active = mem_cgroup_get_lru_size(lruvec, LRU_ACTIVE_ANON); 1506 1507 gb = (inactive + active) >> (30 - PAGE_SHIFT); 1508 if (gb) 1509 inactive_ratio = int_sqrt(10 * gb); 1510 else 1511 inactive_ratio = 1; 1512 1513 return inactive * inactive_ratio < active; 1514 } 1515 1516 #define mem_cgroup_from_res_counter(counter, member) \ 1517 container_of(counter, struct mem_cgroup, member) 1518 1519 /** 1520 * mem_cgroup_margin - calculate chargeable space of a memory cgroup 1521 * @memcg: the memory cgroup 1522 * 1523 * Returns the maximum amount of memory @mem can be charged with, in 1524 * pages. 1525 */ 1526 static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) 1527 { 1528 unsigned long long margin; 1529 1530 margin = res_counter_margin(&memcg->res); 1531 if (do_swap_account) 1532 margin = min(margin, res_counter_margin(&memcg->memsw)); 1533 return margin >> PAGE_SHIFT; 1534 } 1535 1536 int mem_cgroup_swappiness(struct mem_cgroup *memcg) 1537 { 1538 /* root ? */ 1539 if (!css_parent(&memcg->css)) 1540 return vm_swappiness; 1541 1542 return memcg->swappiness; 1543 } 1544 1545 /* 1546 * memcg->moving_account is used for checking possibility that some thread is 1547 * calling move_account(). When a thread on CPU-A starts moving pages under 1548 * a memcg, other threads should check memcg->moving_account under 1549 * rcu_read_lock(), like this: 1550 * 1551 * CPU-A CPU-B 1552 * rcu_read_lock() 1553 * memcg->moving_account+1 if (memcg->mocing_account) 1554 * take heavy locks. 1555 * synchronize_rcu() update something. 1556 * rcu_read_unlock() 1557 * start move here. 1558 */ 1559 1560 /* for quick checking without looking up memcg */ 1561 atomic_t memcg_moving __read_mostly; 1562 1563 static void mem_cgroup_start_move(struct mem_cgroup *memcg) 1564 { 1565 atomic_inc(&memcg_moving); 1566 atomic_inc(&memcg->moving_account); 1567 synchronize_rcu(); 1568 } 1569 1570 static void mem_cgroup_end_move(struct mem_cgroup *memcg) 1571 { 1572 /* 1573 * Now, mem_cgroup_clear_mc() may call this function with NULL. 1574 * We check NULL in callee rather than caller. 1575 */ 1576 if (memcg) { 1577 atomic_dec(&memcg_moving); 1578 atomic_dec(&memcg->moving_account); 1579 } 1580 } 1581 1582 /* 1583 * 2 routines for checking "mem" is under move_account() or not. 1584 * 1585 * mem_cgroup_stolen() - checking whether a cgroup is mc.from or not. This 1586 * is used for avoiding races in accounting. If true, 1587 * pc->mem_cgroup may be overwritten. 1588 * 1589 * mem_cgroup_under_move() - checking a cgroup is mc.from or mc.to or 1590 * under hierarchy of moving cgroups. This is for 1591 * waiting at hith-memory prressure caused by "move". 
1592 */ 1593 1594 static bool mem_cgroup_stolen(struct mem_cgroup *memcg) 1595 { 1596 VM_BUG_ON(!rcu_read_lock_held()); 1597 return atomic_read(&memcg->moving_account) > 0; 1598 } 1599 1600 static bool mem_cgroup_under_move(struct mem_cgroup *memcg) 1601 { 1602 struct mem_cgroup *from; 1603 struct mem_cgroup *to; 1604 bool ret = false; 1605 /* 1606 * Unlike task_move routines, we access mc.to, mc.from not under 1607 * mutual exclusion by cgroup_mutex. Here, we take spinlock instead. 1608 */ 1609 spin_lock(&mc.lock); 1610 from = mc.from; 1611 to = mc.to; 1612 if (!from) 1613 goto unlock; 1614 1615 ret = mem_cgroup_same_or_subtree(memcg, from) 1616 || mem_cgroup_same_or_subtree(memcg, to); 1617 unlock: 1618 spin_unlock(&mc.lock); 1619 return ret; 1620 } 1621 1622 static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) 1623 { 1624 if (mc.moving_task && current != mc.moving_task) { 1625 if (mem_cgroup_under_move(memcg)) { 1626 DEFINE_WAIT(wait); 1627 prepare_to_wait(&mc.waitq, &wait, TASK_INTERRUPTIBLE); 1628 /* moving charge context might have finished. */ 1629 if (mc.moving_task) 1630 schedule(); 1631 finish_wait(&mc.waitq, &wait); 1632 return true; 1633 } 1634 } 1635 return false; 1636 } 1637 1638 /* 1639 * Take this lock when 1640 * - a code tries to modify page's memcg while it's USED. 1641 * - a code tries to modify page state accounting in a memcg. 1642 * see mem_cgroup_stolen(), too. 1643 */ 1644 static void move_lock_mem_cgroup(struct mem_cgroup *memcg, 1645 unsigned long *flags) 1646 { 1647 spin_lock_irqsave(&memcg->move_lock, *flags); 1648 } 1649 1650 static void move_unlock_mem_cgroup(struct mem_cgroup *memcg, 1651 unsigned long *flags) 1652 { 1653 spin_unlock_irqrestore(&memcg->move_lock, *flags); 1654 } 1655 1656 #define K(x) ((x) << (PAGE_SHIFT-10)) 1657 /** 1658 * mem_cgroup_print_oom_info: Print OOM information relevant to memory controller. 1659 * @memcg: The memory cgroup that went over limit 1660 * @p: Task that is going to be killed 1661 * 1662 * NOTE: @memcg and @p's mem_cgroup can be different when hierarchy is 1663 * enabled 1664 */ 1665 void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) 1666 { 1667 struct cgroup *task_cgrp; 1668 struct cgroup *mem_cgrp; 1669 /* 1670 * Need a buffer in BSS, can't rely on allocations. The code relies 1671 * on the assumption that OOM is serialized for memory controller. 1672 * If this assumption is broken, revisit this code. 
1673 */ 1674 static char memcg_name[PATH_MAX]; 1675 int ret; 1676 struct mem_cgroup *iter; 1677 unsigned int i; 1678 1679 if (!p) 1680 return; 1681 1682 rcu_read_lock(); 1683 1684 mem_cgrp = memcg->css.cgroup; 1685 task_cgrp = task_cgroup(p, mem_cgroup_subsys_id); 1686 1687 ret = cgroup_path(task_cgrp, memcg_name, PATH_MAX); 1688 if (ret < 0) { 1689 /* 1690 * Unfortunately, we are unable to convert to a useful name 1691 * But we'll still print out the usage information 1692 */ 1693 rcu_read_unlock(); 1694 goto done; 1695 } 1696 rcu_read_unlock(); 1697 1698 pr_info("Task in %s killed", memcg_name); 1699 1700 rcu_read_lock(); 1701 ret = cgroup_path(mem_cgrp, memcg_name, PATH_MAX); 1702 if (ret < 0) { 1703 rcu_read_unlock(); 1704 goto done; 1705 } 1706 rcu_read_unlock(); 1707 1708 /* 1709 * Continues from above, so we don't need an KERN_ level 1710 */ 1711 pr_cont(" as a result of limit of %s\n", memcg_name); 1712 done: 1713 1714 pr_info("memory: usage %llukB, limit %llukB, failcnt %llu\n", 1715 res_counter_read_u64(&memcg->res, RES_USAGE) >> 10, 1716 res_counter_read_u64(&memcg->res, RES_LIMIT) >> 10, 1717 res_counter_read_u64(&memcg->res, RES_FAILCNT)); 1718 pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %llu\n", 1719 res_counter_read_u64(&memcg->memsw, RES_USAGE) >> 10, 1720 res_counter_read_u64(&memcg->memsw, RES_LIMIT) >> 10, 1721 res_counter_read_u64(&memcg->memsw, RES_FAILCNT)); 1722 pr_info("kmem: usage %llukB, limit %llukB, failcnt %llu\n", 1723 res_counter_read_u64(&memcg->kmem, RES_USAGE) >> 10, 1724 res_counter_read_u64(&memcg->kmem, RES_LIMIT) >> 10, 1725 res_counter_read_u64(&memcg->kmem, RES_FAILCNT)); 1726 1727 for_each_mem_cgroup_tree(iter, memcg) { 1728 pr_info("Memory cgroup stats"); 1729 1730 rcu_read_lock(); 1731 ret = cgroup_path(iter->css.cgroup, memcg_name, PATH_MAX); 1732 if (!ret) 1733 pr_cont(" for %s", memcg_name); 1734 rcu_read_unlock(); 1735 pr_cont(":"); 1736 1737 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 1738 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 1739 continue; 1740 pr_cont(" %s:%ldKB", mem_cgroup_stat_names[i], 1741 K(mem_cgroup_read_stat(iter, i))); 1742 } 1743 1744 for (i = 0; i < NR_LRU_LISTS; i++) 1745 pr_cont(" %s:%luKB", mem_cgroup_lru_names[i], 1746 K(mem_cgroup_nr_lru_pages(iter, BIT(i)))); 1747 1748 pr_cont("\n"); 1749 } 1750 } 1751 1752 /* 1753 * This function returns the number of memcg under hierarchy tree. Returns 1754 * 1(self count) if no children. 1755 */ 1756 static int mem_cgroup_count_children(struct mem_cgroup *memcg) 1757 { 1758 int num = 0; 1759 struct mem_cgroup *iter; 1760 1761 for_each_mem_cgroup_tree(iter, memcg) 1762 num++; 1763 return num; 1764 } 1765 1766 /* 1767 * Return the memory (and swap, if configured) limit for a memcg. 1768 */ 1769 static u64 mem_cgroup_get_limit(struct mem_cgroup *memcg) 1770 { 1771 u64 limit; 1772 1773 limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 1774 1775 /* 1776 * Do not consider swap space if we cannot swap due to swappiness 1777 */ 1778 if (mem_cgroup_swappiness(memcg)) { 1779 u64 memsw; 1780 1781 limit += total_swap_pages << PAGE_SHIFT; 1782 memsw = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 1783 1784 /* 1785 * If memsw is finite and limits the amount of swap space 1786 * available to this memcg, return that limit. 
1787 */ 1788 limit = min(limit, memsw); 1789 } 1790 1791 return limit; 1792 } 1793 1794 static void mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, 1795 int order) 1796 { 1797 struct mem_cgroup *iter; 1798 unsigned long chosen_points = 0; 1799 unsigned long totalpages; 1800 unsigned int points = 0; 1801 struct task_struct *chosen = NULL; 1802 1803 /* 1804 * If current has a pending SIGKILL or is exiting, then automatically 1805 * select it. The goal is to allow it to allocate so that it may 1806 * quickly exit and free its memory. 1807 */ 1808 if (fatal_signal_pending(current) || current->flags & PF_EXITING) { 1809 set_thread_flag(TIF_MEMDIE); 1810 return; 1811 } 1812 1813 check_panic_on_oom(CONSTRAINT_MEMCG, gfp_mask, order, NULL); 1814 totalpages = mem_cgroup_get_limit(memcg) >> PAGE_SHIFT ? : 1; 1815 for_each_mem_cgroup_tree(iter, memcg) { 1816 struct css_task_iter it; 1817 struct task_struct *task; 1818 1819 css_task_iter_start(&iter->css, &it); 1820 while ((task = css_task_iter_next(&it))) { 1821 switch (oom_scan_process_thread(task, totalpages, NULL, 1822 false)) { 1823 case OOM_SCAN_SELECT: 1824 if (chosen) 1825 put_task_struct(chosen); 1826 chosen = task; 1827 chosen_points = ULONG_MAX; 1828 get_task_struct(chosen); 1829 /* fall through */ 1830 case OOM_SCAN_CONTINUE: 1831 continue; 1832 case OOM_SCAN_ABORT: 1833 css_task_iter_end(&it); 1834 mem_cgroup_iter_break(memcg, iter); 1835 if (chosen) 1836 put_task_struct(chosen); 1837 return; 1838 case OOM_SCAN_OK: 1839 break; 1840 }; 1841 points = oom_badness(task, memcg, NULL, totalpages); 1842 if (points > chosen_points) { 1843 if (chosen) 1844 put_task_struct(chosen); 1845 chosen = task; 1846 chosen_points = points; 1847 get_task_struct(chosen); 1848 } 1849 } 1850 css_task_iter_end(&it); 1851 } 1852 1853 if (!chosen) 1854 return; 1855 points = chosen_points * 1000 / totalpages; 1856 oom_kill_process(chosen, gfp_mask, order, points, totalpages, memcg, 1857 NULL, "Memory cgroup out of memory"); 1858 } 1859 1860 static unsigned long mem_cgroup_reclaim(struct mem_cgroup *memcg, 1861 gfp_t gfp_mask, 1862 unsigned long flags) 1863 { 1864 unsigned long total = 0; 1865 bool noswap = false; 1866 int loop; 1867 1868 if (flags & MEM_CGROUP_RECLAIM_NOSWAP) 1869 noswap = true; 1870 if (!(flags & MEM_CGROUP_RECLAIM_SHRINK) && memcg->memsw_is_minimum) 1871 noswap = true; 1872 1873 for (loop = 0; loop < MEM_CGROUP_MAX_RECLAIM_LOOPS; loop++) { 1874 if (loop) 1875 drain_all_stock_async(memcg); 1876 total += try_to_free_mem_cgroup_pages(memcg, gfp_mask, noswap); 1877 /* 1878 * Allow limit shrinkers, which are triggered directly 1879 * by userspace, to catch signals and stop reclaim 1880 * after minimal progress, regardless of the margin. 1881 */ 1882 if (total && (flags & MEM_CGROUP_RECLAIM_SHRINK)) 1883 break; 1884 if (mem_cgroup_margin(memcg)) 1885 break; 1886 /* 1887 * If nothing was reclaimed after two attempts, there 1888 * may be no reclaimable pages in this hierarchy. 1889 */ 1890 if (loop && !total) 1891 break; 1892 } 1893 return total; 1894 } 1895 1896 /** 1897 * test_mem_cgroup_node_reclaimable 1898 * @memcg: the target memcg 1899 * @nid: the node ID to be checked. 1900 * @noswap : specify true here if the user wants flle only information. 1901 * 1902 * This function returns whether the specified memcg contains any 1903 * reclaimable pages on a node. Returns true if there are any reclaimable 1904 * pages in the node. 
1905 */
1906 static bool test_mem_cgroup_node_reclaimable(struct mem_cgroup *memcg,
1907 int nid, bool noswap)
1908 {
1909 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_FILE))
1910 return true;
1911 if (noswap || !total_swap_pages)
1912 return false;
1913 if (mem_cgroup_node_nr_lru_pages(memcg, nid, LRU_ALL_ANON))
1914 return true;
1915 return false;
1916
1917 }
1918 #if MAX_NUMNODES > 1
1919
1920 /*
1921 * Always updating the nodemask is not very good - even if we have an empty
1922 * list or the wrong list here, we can start from some node and traverse all
1923 * nodes based on the zonelist. So update the list loosely once every 10 seconds.
1924 *
1925 */
1926 static void mem_cgroup_may_update_nodemask(struct mem_cgroup *memcg)
1927 {
1928 int nid;
1929 /*
1930 * numainfo_events > 0 means there was at least NUMAINFO_EVENTS_TARGET
1931 * pagein/pageout changes since the last update.
1932 */
1933 if (!atomic_read(&memcg->numainfo_events))
1934 return;
1935 if (atomic_inc_return(&memcg->numainfo_updating) > 1)
1936 return;
1937
1938 /* make a nodemask where this memcg uses memory from */
1939 memcg->scan_nodes = node_states[N_MEMORY];
1940
1941 for_each_node_mask(nid, node_states[N_MEMORY]) {
1942
1943 if (!test_mem_cgroup_node_reclaimable(memcg, nid, false))
1944 node_clear(nid, memcg->scan_nodes);
1945 }
1946
1947 atomic_set(&memcg->numainfo_events, 0);
1948 atomic_set(&memcg->numainfo_updating, 0);
1949 }
1950
1951 /*
1952 * Select a node to start reclaim from. Because all we need is to reduce the
1953 * usage counter, starting from any node is OK. Reclaiming from the
1954 * current node has both pros and cons.
1955 *
1956 * Freeing memory from the current node means freeing memory from a node which
1957 * we'll use or have used. So, it may disturb that node's LRU. And if several
1958 * threads hit their limits, they will all contend on one node. But freeing from
1959 * a remote node costs more for memory reclaim because of memory latency.
1960 *
1961 * For now, we use round-robin. A better algorithm is welcome.
1962 */
1963 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg)
1964 {
1965 int node;
1966
1967 mem_cgroup_may_update_nodemask(memcg);
1968 node = memcg->last_scanned_node;
1969
1970 node = next_node(node, memcg->scan_nodes);
1971 if (node == MAX_NUMNODES)
1972 node = first_node(memcg->scan_nodes);
1973 /*
1974 * We call this when we hit the limit, not when pages are added to the LRU.
1975 * No LRU may hold pages because all pages are UNEVICTABLE, or the
1976 * memcg is too small and none of its pages are on an LRU. In that case,
1977 * we use the current node.
1978 */
1979 if (unlikely(node == MAX_NUMNODES))
1980 node = numa_node_id();
1981
1982 memcg->last_scanned_node = node;
1983 return node;
1984 }
1985
1986 /*
1987 * Check all nodes for reclaimable pages.
1988 * For a quick scan, we make use of scan_nodes. This allows us to skip
1989 * unused nodes. But scan_nodes is lazily updated and may not contain
1990 * enough new information, so we need to double check.
1991 */
1992 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap)
1993 {
1994 int nid;
1995
1996 /*
1997 * quick check...making use of scan_nodes.
1998 * We can skip unused nodes.
1999 */
2000 if (!nodes_empty(memcg->scan_nodes)) {
2001 for (nid = first_node(memcg->scan_nodes);
2002 nid < MAX_NUMNODES;
2003 nid = next_node(nid, memcg->scan_nodes)) {
2004
2005 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap))
2006 return true;
2007 }
2008 }
2009 /*
2010 * Check the rest of the nodes.
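* For example, if scan_nodes was computed as {0,2} but node 1 has since
* gained reclaimable pages, the quick check above misses it and this
* second pass still reports the hierarchy as reclaimable.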
2011 */ 2012 for_each_node_state(nid, N_MEMORY) { 2013 if (node_isset(nid, memcg->scan_nodes)) 2014 continue; 2015 if (test_mem_cgroup_node_reclaimable(memcg, nid, noswap)) 2016 return true; 2017 } 2018 return false; 2019 } 2020 2021 #else 2022 int mem_cgroup_select_victim_node(struct mem_cgroup *memcg) 2023 { 2024 return 0; 2025 } 2026 2027 static bool mem_cgroup_reclaimable(struct mem_cgroup *memcg, bool noswap) 2028 { 2029 return test_mem_cgroup_node_reclaimable(memcg, 0, noswap); 2030 } 2031 #endif 2032 2033 static int mem_cgroup_soft_reclaim(struct mem_cgroup *root_memcg, 2034 struct zone *zone, 2035 gfp_t gfp_mask, 2036 unsigned long *total_scanned) 2037 { 2038 struct mem_cgroup *victim = NULL; 2039 int total = 0; 2040 int loop = 0; 2041 unsigned long excess; 2042 unsigned long nr_scanned; 2043 struct mem_cgroup_reclaim_cookie reclaim = { 2044 .zone = zone, 2045 .priority = 0, 2046 }; 2047 2048 excess = res_counter_soft_limit_excess(&root_memcg->res) >> PAGE_SHIFT; 2049 2050 while (1) { 2051 victim = mem_cgroup_iter(root_memcg, victim, &reclaim); 2052 if (!victim) { 2053 loop++; 2054 if (loop >= 2) { 2055 /* 2056 * If we have not been able to reclaim 2057 * anything, it might because there are 2058 * no reclaimable pages under this hierarchy 2059 */ 2060 if (!total) 2061 break; 2062 /* 2063 * We want to do more targeted reclaim. 2064 * excess >> 2 is not to excessive so as to 2065 * reclaim too much, nor too less that we keep 2066 * coming back to reclaim from this cgroup 2067 */ 2068 if (total >= (excess >> 2) || 2069 (loop > MEM_CGROUP_MAX_RECLAIM_LOOPS)) 2070 break; 2071 } 2072 continue; 2073 } 2074 if (!mem_cgroup_reclaimable(victim, false)) 2075 continue; 2076 total += mem_cgroup_shrink_node_zone(victim, gfp_mask, false, 2077 zone, &nr_scanned); 2078 *total_scanned += nr_scanned; 2079 if (!res_counter_soft_limit_excess(&root_memcg->res)) 2080 break; 2081 } 2082 mem_cgroup_iter_break(root_memcg, victim); 2083 return total; 2084 } 2085 2086 #ifdef CONFIG_LOCKDEP 2087 static struct lockdep_map memcg_oom_lock_dep_map = { 2088 .name = "memcg_oom_lock", 2089 }; 2090 #endif 2091 2092 static DEFINE_SPINLOCK(memcg_oom_lock); 2093 2094 /* 2095 * Check OOM-Killer is already running under our hierarchy. 2096 * If someone is running, return false. 2097 */ 2098 static bool mem_cgroup_oom_trylock(struct mem_cgroup *memcg) 2099 { 2100 struct mem_cgroup *iter, *failed = NULL; 2101 2102 spin_lock(&memcg_oom_lock); 2103 2104 for_each_mem_cgroup_tree(iter, memcg) { 2105 if (iter->oom_lock) { 2106 /* 2107 * this subtree of our hierarchy is already locked 2108 * so we cannot give a lock. 
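* For example, in a hierarchy A -> B -> C where C's oom_lock is already
* set, A and B get locked on the way down, the walk stops at C, and the
* cleanup pass below clears A and B again before the trylock fails.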
2109 */ 2110 failed = iter; 2111 mem_cgroup_iter_break(memcg, iter); 2112 break; 2113 } else 2114 iter->oom_lock = true; 2115 } 2116 2117 if (failed) { 2118 /* 2119 * OK, we failed to lock the whole subtree so we have 2120 * to clean up what we set up to the failing subtree 2121 */ 2122 for_each_mem_cgroup_tree(iter, memcg) { 2123 if (iter == failed) { 2124 mem_cgroup_iter_break(memcg, iter); 2125 break; 2126 } 2127 iter->oom_lock = false; 2128 } 2129 } else 2130 mutex_acquire(&memcg_oom_lock_dep_map, 0, 1, _RET_IP_); 2131 2132 spin_unlock(&memcg_oom_lock); 2133 2134 return !failed; 2135 } 2136 2137 static void mem_cgroup_oom_unlock(struct mem_cgroup *memcg) 2138 { 2139 struct mem_cgroup *iter; 2140 2141 spin_lock(&memcg_oom_lock); 2142 mutex_release(&memcg_oom_lock_dep_map, 1, _RET_IP_); 2143 for_each_mem_cgroup_tree(iter, memcg) 2144 iter->oom_lock = false; 2145 spin_unlock(&memcg_oom_lock); 2146 } 2147 2148 static void mem_cgroup_mark_under_oom(struct mem_cgroup *memcg) 2149 { 2150 struct mem_cgroup *iter; 2151 2152 for_each_mem_cgroup_tree(iter, memcg) 2153 atomic_inc(&iter->under_oom); 2154 } 2155 2156 static void mem_cgroup_unmark_under_oom(struct mem_cgroup *memcg) 2157 { 2158 struct mem_cgroup *iter; 2159 2160 /* 2161 * When a new child is created while the hierarchy is under oom, 2162 * mem_cgroup_oom_lock() may not be called. We have to use 2163 * atomic_add_unless() here. 2164 */ 2165 for_each_mem_cgroup_tree(iter, memcg) 2166 atomic_add_unless(&iter->under_oom, -1, 0); 2167 } 2168 2169 static DECLARE_WAIT_QUEUE_HEAD(memcg_oom_waitq); 2170 2171 struct oom_wait_info { 2172 struct mem_cgroup *memcg; 2173 wait_queue_t wait; 2174 }; 2175 2176 static int memcg_oom_wake_function(wait_queue_t *wait, 2177 unsigned mode, int sync, void *arg) 2178 { 2179 struct mem_cgroup *wake_memcg = (struct mem_cgroup *)arg; 2180 struct mem_cgroup *oom_wait_memcg; 2181 struct oom_wait_info *oom_wait_info; 2182 2183 oom_wait_info = container_of(wait, struct oom_wait_info, wait); 2184 oom_wait_memcg = oom_wait_info->memcg; 2185 2186 /* 2187 * Both of oom_wait_info->memcg and wake_memcg are stable under us. 2188 * Then we can use css_is_ancestor without taking care of RCU. 2189 */ 2190 if (!mem_cgroup_same_or_subtree(oom_wait_memcg, wake_memcg) 2191 && !mem_cgroup_same_or_subtree(wake_memcg, oom_wait_memcg)) 2192 return 0; 2193 return autoremove_wake_function(wait, mode, sync, arg); 2194 } 2195 2196 static void memcg_wakeup_oom(struct mem_cgroup *memcg) 2197 { 2198 atomic_inc(&memcg->oom_wakeups); 2199 /* for filtering, pass "memcg" as argument. */ 2200 __wake_up(&memcg_oom_waitq, TASK_NORMAL, 0, memcg); 2201 } 2202 2203 static void memcg_oom_recover(struct mem_cgroup *memcg) 2204 { 2205 if (memcg && atomic_read(&memcg->under_oom)) 2206 memcg_wakeup_oom(memcg); 2207 } 2208 2209 static void mem_cgroup_oom(struct mem_cgroup *memcg, gfp_t mask, int order) 2210 { 2211 if (!current->memcg_oom.may_oom) 2212 return; 2213 /* 2214 * We are in the middle of the charge context here, so we 2215 * don't want to block when potentially sitting on a callstack 2216 * that holds all kinds of filesystem and mm locks. 2217 * 2218 * Also, the caller may handle a failed allocation gracefully 2219 * (like optional page cache readahead) and so an OOM killer 2220 * invocation might not even be necessary. 
2221 * 2222 * That's why we don't do anything here except remember the 2223 * OOM context and then deal with it at the end of the page 2224 * fault when the stack is unwound, the locks are released, 2225 * and when we know whether the fault was overall successful. 2226 */ 2227 css_get(&memcg->css); 2228 current->memcg_oom.memcg = memcg; 2229 current->memcg_oom.gfp_mask = mask; 2230 current->memcg_oom.order = order; 2231 } 2232 2233 /** 2234 * mem_cgroup_oom_synchronize - complete memcg OOM handling 2235 * @handle: actually kill/wait or just clean up the OOM state 2236 * 2237 * This has to be called at the end of a page fault if the memcg OOM 2238 * handler was enabled. 2239 * 2240 * Memcg supports userspace OOM handling where failed allocations must 2241 * sleep on a waitqueue until the userspace task resolves the 2242 * situation. Sleeping directly in the charge context with all kinds 2243 * of locks held is not a good idea, instead we remember an OOM state 2244 * in the task and mem_cgroup_oom_synchronize() has to be called at 2245 * the end of the page fault to complete the OOM handling. 2246 * 2247 * Returns %true if an ongoing memcg OOM situation was detected and 2248 * completed, %false otherwise. 2249 */ 2250 bool mem_cgroup_oom_synchronize(bool handle) 2251 { 2252 struct mem_cgroup *memcg = current->memcg_oom.memcg; 2253 struct oom_wait_info owait; 2254 bool locked; 2255 2256 /* OOM is global, do not handle */ 2257 if (!memcg) 2258 return false; 2259 2260 if (!handle) 2261 goto cleanup; 2262 2263 owait.memcg = memcg; 2264 owait.wait.flags = 0; 2265 owait.wait.func = memcg_oom_wake_function; 2266 owait.wait.private = current; 2267 INIT_LIST_HEAD(&owait.wait.task_list); 2268 2269 prepare_to_wait(&memcg_oom_waitq, &owait.wait, TASK_KILLABLE); 2270 mem_cgroup_mark_under_oom(memcg); 2271 2272 locked = mem_cgroup_oom_trylock(memcg); 2273 2274 if (locked) 2275 mem_cgroup_oom_notify(memcg); 2276 2277 if (locked && !memcg->oom_kill_disable) { 2278 mem_cgroup_unmark_under_oom(memcg); 2279 finish_wait(&memcg_oom_waitq, &owait.wait); 2280 mem_cgroup_out_of_memory(memcg, current->memcg_oom.gfp_mask, 2281 current->memcg_oom.order); 2282 } else { 2283 schedule(); 2284 mem_cgroup_unmark_under_oom(memcg); 2285 finish_wait(&memcg_oom_waitq, &owait.wait); 2286 } 2287 2288 if (locked) { 2289 mem_cgroup_oom_unlock(memcg); 2290 /* 2291 * There is no guarantee that an OOM-lock contender 2292 * sees the wakeups triggered by the OOM kill 2293 * uncharges. Wake any sleepers explicitely. 2294 */ 2295 memcg_oom_recover(memcg); 2296 } 2297 cleanup: 2298 current->memcg_oom.memcg = NULL; 2299 css_put(&memcg->css); 2300 return true; 2301 } 2302 2303 /* 2304 * Currently used to update mapped file statistics, but the routine can be 2305 * generalized to update other statistics as well. 2306 * 2307 * Notes: Race condition 2308 * 2309 * We usually use page_cgroup_lock() for accessing page_cgroup member but 2310 * it tends to be costly. But considering some conditions, we doesn't need 2311 * to do so _always_. 2312 * 2313 * Considering "charge", lock_page_cgroup() is not required because all 2314 * file-stat operations happen after a page is attached to radix-tree. There 2315 * are no race with "charge". 2316 * 2317 * Considering "uncharge", we know that memcg doesn't clear pc->mem_cgroup 2318 * at "uncharge" intentionally. So, we always see valid pc->mem_cgroup even 2319 * if there are race with "uncharge". Statistics itself is properly handled 2320 * by flags. 
2321 * 2322 * Considering "move", this is an only case we see a race. To make the race 2323 * small, we check mm->moving_account and detect there are possibility of race 2324 * If there is, we take a lock. 2325 */ 2326 2327 void __mem_cgroup_begin_update_page_stat(struct page *page, 2328 bool *locked, unsigned long *flags) 2329 { 2330 struct mem_cgroup *memcg; 2331 struct page_cgroup *pc; 2332 2333 pc = lookup_page_cgroup(page); 2334 again: 2335 memcg = pc->mem_cgroup; 2336 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2337 return; 2338 /* 2339 * If this memory cgroup is not under account moving, we don't 2340 * need to take move_lock_mem_cgroup(). Because we already hold 2341 * rcu_read_lock(), any calls to move_account will be delayed until 2342 * rcu_read_unlock() if mem_cgroup_stolen() == true. 2343 */ 2344 if (!mem_cgroup_stolen(memcg)) 2345 return; 2346 2347 move_lock_mem_cgroup(memcg, flags); 2348 if (memcg != pc->mem_cgroup || !PageCgroupUsed(pc)) { 2349 move_unlock_mem_cgroup(memcg, flags); 2350 goto again; 2351 } 2352 *locked = true; 2353 } 2354 2355 void __mem_cgroup_end_update_page_stat(struct page *page, unsigned long *flags) 2356 { 2357 struct page_cgroup *pc = lookup_page_cgroup(page); 2358 2359 /* 2360 * It's guaranteed that pc->mem_cgroup never changes while 2361 * lock is held because a routine modifies pc->mem_cgroup 2362 * should take move_lock_mem_cgroup(). 2363 */ 2364 move_unlock_mem_cgroup(pc->mem_cgroup, flags); 2365 } 2366 2367 void mem_cgroup_update_page_stat(struct page *page, 2368 enum mem_cgroup_stat_index idx, int val) 2369 { 2370 struct mem_cgroup *memcg; 2371 struct page_cgroup *pc = lookup_page_cgroup(page); 2372 unsigned long uninitialized_var(flags); 2373 2374 if (mem_cgroup_disabled()) 2375 return; 2376 2377 VM_BUG_ON(!rcu_read_lock_held()); 2378 memcg = pc->mem_cgroup; 2379 if (unlikely(!memcg || !PageCgroupUsed(pc))) 2380 return; 2381 2382 this_cpu_add(memcg->stat->count[idx], val); 2383 } 2384 2385 /* 2386 * size of first charge trial. "32" comes from vmscan.c's magic value. 2387 * TODO: maybe necessary to use big numbers in big irons. 2388 */ 2389 #define CHARGE_BATCH 32U 2390 struct memcg_stock_pcp { 2391 struct mem_cgroup *cached; /* this never be root cgroup */ 2392 unsigned int nr_pages; 2393 struct work_struct work; 2394 unsigned long flags; 2395 #define FLUSHING_CACHED_CHARGE 0 2396 }; 2397 static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); 2398 static DEFINE_MUTEX(percpu_charge_mutex); 2399 2400 /** 2401 * consume_stock: Try to consume stocked charge on this cpu. 2402 * @memcg: memcg to consume from. 2403 * @nr_pages: how many pages to charge. 2404 * 2405 * The charges will only happen if @memcg matches the current cpu's memcg 2406 * stock, and at least @nr_pages are available in that stock. Failure to 2407 * service an allocation will refill the stock. 2408 * 2409 * returns true if successful, false otherwise. 2410 */ 2411 static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2412 { 2413 struct memcg_stock_pcp *stock; 2414 bool ret = true; 2415 2416 if (nr_pages > CHARGE_BATCH) 2417 return false; 2418 2419 stock = &get_cpu_var(memcg_stock); 2420 if (memcg == stock->cached && stock->nr_pages >= nr_pages) 2421 stock->nr_pages -= nr_pages; 2422 else /* need to call res_counter_charge */ 2423 ret = false; 2424 put_cpu_var(memcg_stock); 2425 return ret; 2426 } 2427 2428 /* 2429 * Returns stocks cached in percpu to res_counter and reset cached information. 
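* (The stock exists because __mem_cgroup_try_charge() charges up to
* CHARGE_BATCH pages at once and parks the surplus here via refill_stock();
* e.g. a single-page charge leaves 31 pages stocked for later
* consume_stock() calls on this cpu.)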
2430 */ 2431 static void drain_stock(struct memcg_stock_pcp *stock) 2432 { 2433 struct mem_cgroup *old = stock->cached; 2434 2435 if (stock->nr_pages) { 2436 unsigned long bytes = stock->nr_pages * PAGE_SIZE; 2437 2438 res_counter_uncharge(&old->res, bytes); 2439 if (do_swap_account) 2440 res_counter_uncharge(&old->memsw, bytes); 2441 stock->nr_pages = 0; 2442 } 2443 stock->cached = NULL; 2444 } 2445 2446 /* 2447 * This must be called under preempt disabled or must be called by 2448 * a thread which is pinned to local cpu. 2449 */ 2450 static void drain_local_stock(struct work_struct *dummy) 2451 { 2452 struct memcg_stock_pcp *stock = &__get_cpu_var(memcg_stock); 2453 drain_stock(stock); 2454 clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); 2455 } 2456 2457 static void __init memcg_stock_init(void) 2458 { 2459 int cpu; 2460 2461 for_each_possible_cpu(cpu) { 2462 struct memcg_stock_pcp *stock = 2463 &per_cpu(memcg_stock, cpu); 2464 INIT_WORK(&stock->work, drain_local_stock); 2465 } 2466 } 2467 2468 /* 2469 * Cache charges(val) which is from res_counter, to local per_cpu area. 2470 * This will be consumed by consume_stock() function, later. 2471 */ 2472 static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) 2473 { 2474 struct memcg_stock_pcp *stock = &get_cpu_var(memcg_stock); 2475 2476 if (stock->cached != memcg) { /* reset if necessary */ 2477 drain_stock(stock); 2478 stock->cached = memcg; 2479 } 2480 stock->nr_pages += nr_pages; 2481 put_cpu_var(memcg_stock); 2482 } 2483 2484 /* 2485 * Drains all per-CPU charge caches for given root_memcg resp. subtree 2486 * of the hierarchy under it. sync flag says whether we should block 2487 * until the work is done. 2488 */ 2489 static void drain_all_stock(struct mem_cgroup *root_memcg, bool sync) 2490 { 2491 int cpu, curcpu; 2492 2493 /* Notify other cpus that system-wide "drain" is running */ 2494 get_online_cpus(); 2495 curcpu = get_cpu(); 2496 for_each_online_cpu(cpu) { 2497 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2498 struct mem_cgroup *memcg; 2499 2500 memcg = stock->cached; 2501 if (!memcg || !stock->nr_pages) 2502 continue; 2503 if (!mem_cgroup_same_or_subtree(root_memcg, memcg)) 2504 continue; 2505 if (!test_and_set_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) { 2506 if (cpu == curcpu) 2507 drain_local_stock(&stock->work); 2508 else 2509 schedule_work_on(cpu, &stock->work); 2510 } 2511 } 2512 put_cpu(); 2513 2514 if (!sync) 2515 goto out; 2516 2517 for_each_online_cpu(cpu) { 2518 struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); 2519 if (test_bit(FLUSHING_CACHED_CHARGE, &stock->flags)) 2520 flush_work(&stock->work); 2521 } 2522 out: 2523 put_online_cpus(); 2524 } 2525 2526 /* 2527 * Tries to drain stocked charges in other cpus. This function is asynchronous 2528 * and just put a work per cpu for draining localy on each cpu. Caller can 2529 * expects some charges will be back to res_counter later but cannot wait for 2530 * it. 2531 */ 2532 static void drain_all_stock_async(struct mem_cgroup *root_memcg) 2533 { 2534 /* 2535 * If someone calls draining, avoid adding more kworker runs. 2536 */ 2537 if (!mutex_trylock(&percpu_charge_mutex)) 2538 return; 2539 drain_all_stock(root_memcg, false); 2540 mutex_unlock(&percpu_charge_mutex); 2541 } 2542 2543 /* This is a synchronous drain interface. 
*/ 2544 static void drain_all_stock_sync(struct mem_cgroup *root_memcg) 2545 { 2546 /* called when force_empty is called */ 2547 mutex_lock(&percpu_charge_mutex); 2548 drain_all_stock(root_memcg, true); 2549 mutex_unlock(&percpu_charge_mutex); 2550 } 2551 2552 /* 2553 * This function drains percpu counter value from DEAD cpu and 2554 * move it to local cpu. Note that this function can be preempted. 2555 */ 2556 static void mem_cgroup_drain_pcp_counter(struct mem_cgroup *memcg, int cpu) 2557 { 2558 int i; 2559 2560 spin_lock(&memcg->pcp_counter_lock); 2561 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 2562 long x = per_cpu(memcg->stat->count[i], cpu); 2563 2564 per_cpu(memcg->stat->count[i], cpu) = 0; 2565 memcg->nocpu_base.count[i] += x; 2566 } 2567 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 2568 unsigned long x = per_cpu(memcg->stat->events[i], cpu); 2569 2570 per_cpu(memcg->stat->events[i], cpu) = 0; 2571 memcg->nocpu_base.events[i] += x; 2572 } 2573 spin_unlock(&memcg->pcp_counter_lock); 2574 } 2575 2576 static int memcg_cpu_hotplug_callback(struct notifier_block *nb, 2577 unsigned long action, 2578 void *hcpu) 2579 { 2580 int cpu = (unsigned long)hcpu; 2581 struct memcg_stock_pcp *stock; 2582 struct mem_cgroup *iter; 2583 2584 if (action == CPU_ONLINE) 2585 return NOTIFY_OK; 2586 2587 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN) 2588 return NOTIFY_OK; 2589 2590 for_each_mem_cgroup(iter) 2591 mem_cgroup_drain_pcp_counter(iter, cpu); 2592 2593 stock = &per_cpu(memcg_stock, cpu); 2594 drain_stock(stock); 2595 return NOTIFY_OK; 2596 } 2597 2598 2599 /* See __mem_cgroup_try_charge() for details */ 2600 enum { 2601 CHARGE_OK, /* success */ 2602 CHARGE_RETRY, /* need to retry but retry is not bad */ 2603 CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ 2604 CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ 2605 }; 2606 2607 static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, 2608 unsigned int nr_pages, unsigned int min_pages, 2609 bool invoke_oom) 2610 { 2611 unsigned long csize = nr_pages * PAGE_SIZE; 2612 struct mem_cgroup *mem_over_limit; 2613 struct res_counter *fail_res; 2614 unsigned long flags = 0; 2615 int ret; 2616 2617 ret = res_counter_charge(&memcg->res, csize, &fail_res); 2618 2619 if (likely(!ret)) { 2620 if (!do_swap_account) 2621 return CHARGE_OK; 2622 ret = res_counter_charge(&memcg->memsw, csize, &fail_res); 2623 if (likely(!ret)) 2624 return CHARGE_OK; 2625 2626 res_counter_uncharge(&memcg->res, csize); 2627 mem_over_limit = mem_cgroup_from_res_counter(fail_res, memsw); 2628 flags |= MEM_CGROUP_RECLAIM_NOSWAP; 2629 } else 2630 mem_over_limit = mem_cgroup_from_res_counter(fail_res, res); 2631 /* 2632 * Never reclaim on behalf of optional batching, retry with a 2633 * single page instead. 2634 */ 2635 if (nr_pages > min_pages) 2636 return CHARGE_RETRY; 2637 2638 if (!(gfp_mask & __GFP_WAIT)) 2639 return CHARGE_WOULDBLOCK; 2640 2641 if (gfp_mask & __GFP_NORETRY) 2642 return CHARGE_NOMEM; 2643 2644 ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); 2645 if (mem_cgroup_margin(mem_over_limit) >= nr_pages) 2646 return CHARGE_RETRY; 2647 /* 2648 * Even though the limit is exceeded at this point, reclaim 2649 * may have been able to free some pages. Retry the charge 2650 * before killing the task. 2651 * 2652 * Only for regular pages, though: huge pages are rather 2653 * unlikely to succeed so close to the limit, and we fall back 2654 * to regular pages anyway in case of failure. 
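* (With PAGE_ALLOC_COSTLY_ORDER == 3 this retries charges of up to 8
* pages; a THP charge - e.g. 512 pages for a 2MB huge page with 4K base
* pages - is not retried here.)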
2655 */
2656 if (nr_pages <= (1 << PAGE_ALLOC_COSTLY_ORDER) && ret)
2657 return CHARGE_RETRY;
2658
2659 /*
2660 * During a task move, charges can be doubly counted. So, it's
2661 * better to wait until the end of task_move if one is in progress.
2662 */
2663 if (mem_cgroup_wait_acct_move(mem_over_limit))
2664 return CHARGE_RETRY;
2665
2666 if (invoke_oom)
2667 mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(csize));
2668
2669 return CHARGE_NOMEM;
2670 }
2671
2672 /*
2673 * __mem_cgroup_try_charge() does
2674 * 1. detect the memcg to be charged against from the passed *mm and *ptr,
2675 * 2. update the res_counter,
2676 * 3. call memory reclaim if necessary.
2677 *
2678 * As a special case, if the task is dying (fatal_signal_pending() or
2679 * TIF_MEMDIE set), this function returns -EINTR while writing root_mem_cgroup
2680 * to *ptr. There are two reasons for this: 1. dying threads should quit as soon
2681 * as possible without any hazards; 2. all pages should have a valid
2682 * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
2683 * pointer, that is treated as a charge to root_mem_cgroup.
2684 *
2685 * So __mem_cgroup_try_charge() will return
2686 * 0 ... on success, filling *ptr with a valid memcg pointer.
2687 * -ENOMEM ... charge failure because of resource limits.
2688 * -EINTR ... if the thread is dying. *ptr is filled with root_mem_cgroup.
2689 *
2690 * Unlike the exported interface, an "oom" parameter is added. If oom==true,
2691 * the oom-killer can be invoked.
2692 */
2693 static int __mem_cgroup_try_charge(struct mm_struct *mm,
2694 gfp_t gfp_mask,
2695 unsigned int nr_pages,
2696 struct mem_cgroup **ptr,
2697 bool oom)
2698 {
2699 unsigned int batch = max(CHARGE_BATCH, nr_pages);
2700 int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
2701 struct mem_cgroup *memcg = NULL;
2702 int ret;
2703
2704 /*
2705 * Unlike a global-VM OOM kill, we are not in a system-level memory
2706 * shortage here. So, allow dying processes to proceed, in addition to
2707 * TIF_MEMDIE processes.
2708 */
2709 if (unlikely(test_thread_flag(TIF_MEMDIE)
2710 || fatal_signal_pending(current)))
2711 goto bypass;
2712
2713 if (unlikely(task_in_memcg_oom(current)))
2714 goto nomem;
2715
2716 if (gfp_mask & __GFP_NOFAIL)
2717 oom = false;
2718
2719 /*
2720 * We always charge the cgroup the mm_struct belongs to.
2721 * The mm_struct's mem_cgroup changes on task migration if the
2722 * thread group leader migrates. It's possible that mm is not
2723 * set; if so, charge the root memcg (happens for pagecache usage).
2724 */
2725 if (!*ptr && !mm)
2726 *ptr = root_mem_cgroup;
2727 again:
2728 if (*ptr) { /* css should be a valid one */
2729 memcg = *ptr;
2730 if (mem_cgroup_is_root(memcg))
2731 goto done;
2732 if (consume_stock(memcg, nr_pages))
2733 goto done;
2734 css_get(&memcg->css);
2735 } else {
2736 struct task_struct *p;
2737
2738 rcu_read_lock();
2739 p = rcu_dereference(mm->owner);
2740 /*
2741 * Because we don't have task_lock(), "p" can exit.
2742 * In that case, "memcg" can point to root, or p can be NULL due to a
2743 * race with swapoff. Then, we have a small risk of mis-accounting.
2744 * But this kind of mis-accounting due to races can always happen because
2745 * we don't take cgroup_mutex(). Taking it would be overkill, so we allow
2746 * that small race here.
2747 * (*) swapoff et al. will charge against the mm_struct, not the
2748 * task_struct. So, mm->owner can be NULL.
2749 */
2750 memcg = mem_cgroup_from_task(p);
2751 if (!memcg)
2752 memcg = root_mem_cgroup;
2753 if (mem_cgroup_is_root(memcg)) {
2754 rcu_read_unlock();
2755 goto done;
2756 }
2757 if (consume_stock(memcg, nr_pages)) {
2758 /*
2759 * It may seem dangerous to access memcg without css_get().
2760 * But considering how consume_stock() works, it's not
2761 * necessary. If consume_stock() succeeds, some charges
2762 * from this memcg are already cached on this cpu. So, we
2763 * don't need to call css_get()/css_tryget() before
2764 * calling consume_stock().
2765 */
2766 rcu_read_unlock();
2767 goto done;
2768 }
2769 /* after this point, we may block; we need to take a refcnt */
2770 if (!css_tryget(&memcg->css)) {
2771 rcu_read_unlock();
2772 goto again;
2773 }
2774 rcu_read_unlock();
2775 }
2776
2777 do {
2778 bool invoke_oom = oom && !nr_oom_retries;
2779
2780 /* If killed, bypass the charge */
2781 if (fatal_signal_pending(current)) {
2782 css_put(&memcg->css);
2783 goto bypass;
2784 }
2785
2786 ret = mem_cgroup_do_charge(memcg, gfp_mask, batch,
2787 nr_pages, invoke_oom);
2788 switch (ret) {
2789 case CHARGE_OK:
2790 break;
2791 case CHARGE_RETRY: /* not in OOM situation but retry */
2792 batch = nr_pages;
2793 css_put(&memcg->css);
2794 memcg = NULL;
2795 goto again;
2796 case CHARGE_WOULDBLOCK: /* !__GFP_WAIT */
2797 css_put(&memcg->css);
2798 goto nomem;
2799 case CHARGE_NOMEM: /* OOM routine works */
2800 if (!oom || invoke_oom) {
2801 css_put(&memcg->css);
2802 goto nomem;
2803 }
2804 nr_oom_retries--;
2805 break;
2806 }
2807 } while (ret != CHARGE_OK);
2808
2809 if (batch > nr_pages)
2810 refill_stock(memcg, batch - nr_pages);
2811 css_put(&memcg->css);
2812 done:
2813 *ptr = memcg;
2814 return 0;
2815 nomem:
2816 if (!(gfp_mask & __GFP_NOFAIL)) {
2817 *ptr = NULL;
2818 return -ENOMEM;
2819 }
2820 bypass:
2821 *ptr = root_mem_cgroup;
2822 return -EINTR;
2823 }
2824
2825 /*
2826 * Sometimes we have to undo a charge we got by try_charge().
2827 * This function is for that: it does the uncharge and puts the css refcnt
2828 * obtained by try_charge().
2829 */
2830 static void __mem_cgroup_cancel_charge(struct mem_cgroup *memcg,
2831 unsigned int nr_pages)
2832 {
2833 if (!mem_cgroup_is_root(memcg)) {
2834 unsigned long bytes = nr_pages * PAGE_SIZE;
2835
2836 res_counter_uncharge(&memcg->res, bytes);
2837 if (do_swap_account)
2838 res_counter_uncharge(&memcg->memsw, bytes);
2839 }
2840 }
2841
2842 /*
2843 * Cancel charges in this cgroup; this doesn't propagate to the parent cgroup.
2844 * This is useful when moving usage to the parent cgroup.
2845 */
2846 static void __mem_cgroup_cancel_local_charge(struct mem_cgroup *memcg,
2847 unsigned int nr_pages)
2848 {
2849 unsigned long bytes = nr_pages * PAGE_SIZE;
2850
2851 if (mem_cgroup_is_root(memcg))
2852 return;
2853
2854 res_counter_uncharge_until(&memcg->res, memcg->res.parent, bytes);
2855 if (do_swap_account)
2856 res_counter_uncharge_until(&memcg->memsw,
2857 memcg->memsw.parent, bytes);
2858 }
2859
2860 /*
2861 * A helper function to get a mem_cgroup from an ID. Must be called under
2862 * rcu_read_lock(). The caller is responsible for calling css_tryget() if
2863 * the mem_cgroup is used for charging. (Dropping a refcnt from swap can be
2864 * called against a removed memcg.)
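* For example, the swap-in path below looks up the id recorded at swapout
* time via lookup_swap_cgroup_id() and then css_tryget()s the result
* before using it.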
2865 */ 2866 static struct mem_cgroup *mem_cgroup_lookup(unsigned short id) 2867 { 2868 /* ID 0 is unused ID */ 2869 if (!id) 2870 return NULL; 2871 return mem_cgroup_from_id(id); 2872 } 2873 2874 struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) 2875 { 2876 struct mem_cgroup *memcg = NULL; 2877 struct page_cgroup *pc; 2878 unsigned short id; 2879 swp_entry_t ent; 2880 2881 VM_BUG_ON(!PageLocked(page)); 2882 2883 pc = lookup_page_cgroup(page); 2884 lock_page_cgroup(pc); 2885 if (PageCgroupUsed(pc)) { 2886 memcg = pc->mem_cgroup; 2887 if (memcg && !css_tryget(&memcg->css)) 2888 memcg = NULL; 2889 } else if (PageSwapCache(page)) { 2890 ent.val = page_private(page); 2891 id = lookup_swap_cgroup_id(ent); 2892 rcu_read_lock(); 2893 memcg = mem_cgroup_lookup(id); 2894 if (memcg && !css_tryget(&memcg->css)) 2895 memcg = NULL; 2896 rcu_read_unlock(); 2897 } 2898 unlock_page_cgroup(pc); 2899 return memcg; 2900 } 2901 2902 static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, 2903 struct page *page, 2904 unsigned int nr_pages, 2905 enum charge_type ctype, 2906 bool lrucare) 2907 { 2908 struct page_cgroup *pc = lookup_page_cgroup(page); 2909 struct zone *uninitialized_var(zone); 2910 struct lruvec *lruvec; 2911 bool was_on_lru = false; 2912 bool anon; 2913 2914 lock_page_cgroup(pc); 2915 VM_BUG_ON(PageCgroupUsed(pc)); 2916 /* 2917 * we don't need page_cgroup_lock about tail pages, becase they are not 2918 * accessed by any other context at this point. 2919 */ 2920 2921 /* 2922 * In some cases, SwapCache and FUSE(splice_buf->radixtree), the page 2923 * may already be on some other mem_cgroup's LRU. Take care of it. 2924 */ 2925 if (lrucare) { 2926 zone = page_zone(page); 2927 spin_lock_irq(&zone->lru_lock); 2928 if (PageLRU(page)) { 2929 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2930 ClearPageLRU(page); 2931 del_page_from_lru_list(page, lruvec, page_lru(page)); 2932 was_on_lru = true; 2933 } 2934 } 2935 2936 pc->mem_cgroup = memcg; 2937 /* 2938 * We access a page_cgroup asynchronously without lock_page_cgroup(). 2939 * Especially when a page_cgroup is taken from a page, pc->mem_cgroup 2940 * is accessed after testing USED bit. To make pc->mem_cgroup visible 2941 * before USED bit, we need memory barrier here. 2942 * See mem_cgroup_add_lru_list(), etc. 2943 */ 2944 smp_wmb(); 2945 SetPageCgroupUsed(pc); 2946 2947 if (lrucare) { 2948 if (was_on_lru) { 2949 lruvec = mem_cgroup_zone_lruvec(zone, pc->mem_cgroup); 2950 VM_BUG_ON(PageLRU(page)); 2951 SetPageLRU(page); 2952 add_page_to_lru_list(page, lruvec, page_lru(page)); 2953 } 2954 spin_unlock_irq(&zone->lru_lock); 2955 } 2956 2957 if (ctype == MEM_CGROUP_CHARGE_TYPE_ANON) 2958 anon = true; 2959 else 2960 anon = false; 2961 2962 mem_cgroup_charge_statistics(memcg, page, anon, nr_pages); 2963 unlock_page_cgroup(pc); 2964 2965 /* 2966 * "charge_statistics" updated event counter. Then, check it. 2967 * Insert ancestor (and ancestor's ancestors), to softlimit RB-tree. 2968 * if they exceeds softlimit. 2969 */ 2970 memcg_check_events(memcg, page); 2971 } 2972 2973 static DEFINE_MUTEX(set_limit_mutex); 2974 2975 #ifdef CONFIG_MEMCG_KMEM 2976 static inline bool memcg_can_account_kmem(struct mem_cgroup *memcg) 2977 { 2978 return !mem_cgroup_disabled() && !mem_cgroup_is_root(memcg) && 2979 (memcg->kmem_account_flags & KMEM_ACCOUNTED_MASK); 2980 } 2981 2982 /* 2983 * This is a bit cumbersome, but it is rarely used and avoids a backpointer 2984 * in the memcg_cache_params struct. 
2985 */ 2986 static struct kmem_cache *memcg_params_to_cache(struct memcg_cache_params *p) 2987 { 2988 struct kmem_cache *cachep; 2989 2990 VM_BUG_ON(p->is_root_cache); 2991 cachep = p->root_cache; 2992 return cache_from_memcg_idx(cachep, memcg_cache_id(p->memcg)); 2993 } 2994 2995 #ifdef CONFIG_SLABINFO 2996 static int mem_cgroup_slabinfo_read(struct cgroup_subsys_state *css, 2997 struct cftype *cft, struct seq_file *m) 2998 { 2999 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 3000 struct memcg_cache_params *params; 3001 3002 if (!memcg_can_account_kmem(memcg)) 3003 return -EIO; 3004 3005 print_slabinfo_header(m); 3006 3007 mutex_lock(&memcg->slab_caches_mutex); 3008 list_for_each_entry(params, &memcg->memcg_slab_caches, list) 3009 cache_show(memcg_params_to_cache(params), m); 3010 mutex_unlock(&memcg->slab_caches_mutex); 3011 3012 return 0; 3013 } 3014 #endif 3015 3016 static int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, u64 size) 3017 { 3018 struct res_counter *fail_res; 3019 struct mem_cgroup *_memcg; 3020 int ret = 0; 3021 3022 ret = res_counter_charge(&memcg->kmem, size, &fail_res); 3023 if (ret) 3024 return ret; 3025 3026 _memcg = memcg; 3027 ret = __mem_cgroup_try_charge(NULL, gfp, size >> PAGE_SHIFT, 3028 &_memcg, oom_gfp_allowed(gfp)); 3029 3030 if (ret == -EINTR) { 3031 /* 3032 * __mem_cgroup_try_charge() chosed to bypass to root due to 3033 * OOM kill or fatal signal. Since our only options are to 3034 * either fail the allocation or charge it to this cgroup, do 3035 * it as a temporary condition. But we can't fail. From a 3036 * kmem/slab perspective, the cache has already been selected, 3037 * by mem_cgroup_kmem_get_cache(), so it is too late to change 3038 * our minds. 3039 * 3040 * This condition will only trigger if the task entered 3041 * memcg_charge_kmem in a sane state, but was OOM-killed during 3042 * __mem_cgroup_try_charge() above. Tasks that were already 3043 * dying when the allocation triggers should have been already 3044 * directed to the root cgroup in memcontrol.h 3045 */ 3046 res_counter_charge_nofail(&memcg->res, size, &fail_res); 3047 if (do_swap_account) 3048 res_counter_charge_nofail(&memcg->memsw, size, 3049 &fail_res); 3050 ret = 0; 3051 } else if (ret) 3052 res_counter_uncharge(&memcg->kmem, size); 3053 3054 return ret; 3055 } 3056 3057 static void memcg_uncharge_kmem(struct mem_cgroup *memcg, u64 size) 3058 { 3059 res_counter_uncharge(&memcg->res, size); 3060 if (do_swap_account) 3061 res_counter_uncharge(&memcg->memsw, size); 3062 3063 /* Not down to 0 */ 3064 if (res_counter_uncharge(&memcg->kmem, size)) 3065 return; 3066 3067 /* 3068 * Releases a reference taken in kmem_cgroup_css_offline in case 3069 * this last uncharge is racing with the offlining code or it is 3070 * outliving the memcg existence. 3071 * 3072 * The memory barrier imposed by test&clear is paired with the 3073 * explicit one in memcg_kmem_mark_dead(). 3074 */ 3075 if (memcg_kmem_test_and_clear_dead(memcg)) 3076 css_put(&memcg->css); 3077 } 3078 3079 void memcg_cache_list_add(struct mem_cgroup *memcg, struct kmem_cache *cachep) 3080 { 3081 if (!memcg) 3082 return; 3083 3084 mutex_lock(&memcg->slab_caches_mutex); 3085 list_add(&cachep->memcg_params->list, &memcg->memcg_slab_caches); 3086 mutex_unlock(&memcg->slab_caches_mutex); 3087 } 3088 3089 /* 3090 * helper for acessing a memcg's index. It will be used as an index in the 3091 * child cache array in kmem_cache, and also to derive its name. 
This function 3092 * will return -1 when this is not a kmem-limited memcg. 3093 */ 3094 int memcg_cache_id(struct mem_cgroup *memcg) 3095 { 3096 return memcg ? memcg->kmemcg_id : -1; 3097 } 3098 3099 /* 3100 * This ends up being protected by the set_limit mutex, during normal 3101 * operation, because that is its main call site. 3102 * 3103 * But when we create a new cache, we can call this as well if its parent 3104 * is kmem-limited. That will have to hold set_limit_mutex as well. 3105 */ 3106 int memcg_update_cache_sizes(struct mem_cgroup *memcg) 3107 { 3108 int num, ret; 3109 3110 num = ida_simple_get(&kmem_limited_groups, 3111 0, MEMCG_CACHES_MAX_SIZE, GFP_KERNEL); 3112 if (num < 0) 3113 return num; 3114 /* 3115 * After this point, kmem_accounted (that we test atomically in 3116 * the beginning of this conditional), is no longer 0. This 3117 * guarantees only one process will set the following boolean 3118 * to true. We don't need test_and_set because we're protected 3119 * by the set_limit_mutex anyway. 3120 */ 3121 memcg_kmem_set_activated(memcg); 3122 3123 ret = memcg_update_all_caches(num+1); 3124 if (ret) { 3125 ida_simple_remove(&kmem_limited_groups, num); 3126 memcg_kmem_clear_activated(memcg); 3127 return ret; 3128 } 3129 3130 memcg->kmemcg_id = num; 3131 INIT_LIST_HEAD(&memcg->memcg_slab_caches); 3132 mutex_init(&memcg->slab_caches_mutex); 3133 return 0; 3134 } 3135 3136 static size_t memcg_caches_array_size(int num_groups) 3137 { 3138 ssize_t size; 3139 if (num_groups <= 0) 3140 return 0; 3141 3142 size = 2 * num_groups; 3143 if (size < MEMCG_CACHES_MIN_SIZE) 3144 size = MEMCG_CACHES_MIN_SIZE; 3145 else if (size > MEMCG_CACHES_MAX_SIZE) 3146 size = MEMCG_CACHES_MAX_SIZE; 3147 3148 return size; 3149 } 3150 3151 /* 3152 * We should update the current array size iff all caches updates succeed. This 3153 * can only be done from the slab side. The slab mutex needs to be held when 3154 * calling this. 3155 */ 3156 void memcg_update_array_size(int num) 3157 { 3158 if (num > memcg_limited_groups_array_size) 3159 memcg_limited_groups_array_size = memcg_caches_array_size(num); 3160 } 3161 3162 static void kmem_cache_destroy_work_func(struct work_struct *w); 3163 3164 int memcg_update_cache_size(struct kmem_cache *s, int num_groups) 3165 { 3166 struct memcg_cache_params *cur_params = s->memcg_params; 3167 3168 VM_BUG_ON(!is_root_cache(s)); 3169 3170 if (num_groups > memcg_limited_groups_array_size) { 3171 int i; 3172 ssize_t size = memcg_caches_array_size(num_groups); 3173 3174 size *= sizeof(void *); 3175 size += offsetof(struct memcg_cache_params, memcg_caches); 3176 3177 s->memcg_params = kzalloc(size, GFP_KERNEL); 3178 if (!s->memcg_params) { 3179 s->memcg_params = cur_params; 3180 return -ENOMEM; 3181 } 3182 3183 s->memcg_params->is_root_cache = true; 3184 3185 /* 3186 * There is the chance it will be bigger than 3187 * memcg_limited_groups_array_size, if we failed an allocation 3188 * in a cache, in which case all caches updated before it, will 3189 * have a bigger array. 3190 * 3191 * But if that is the case, the data after 3192 * memcg_limited_groups_array_size is certainly unused 3193 */ 3194 for (i = 0; i < memcg_limited_groups_array_size; i++) { 3195 if (!cur_params->memcg_caches[i]) 3196 continue; 3197 s->memcg_params->memcg_caches[i] = 3198 cur_params->memcg_caches[i]; 3199 } 3200 3201 /* 3202 * Ideally, we would wait until all caches succeed, and only 3203 * then free the old one. But this is not worth the extra 3204 * pointer per-cache we'd have to have for this. 
3205 * 3206 * It is not a big deal if some caches are left with a size 3207 * bigger than the others. And all updates will reset this 3208 * anyway. 3209 */ 3210 kfree(cur_params); 3211 } 3212 return 0; 3213 } 3214 3215 int memcg_register_cache(struct mem_cgroup *memcg, struct kmem_cache *s, 3216 struct kmem_cache *root_cache) 3217 { 3218 size_t size; 3219 3220 if (!memcg_kmem_enabled()) 3221 return 0; 3222 3223 if (!memcg) { 3224 size = offsetof(struct memcg_cache_params, memcg_caches); 3225 size += memcg_limited_groups_array_size * sizeof(void *); 3226 } else 3227 size = sizeof(struct memcg_cache_params); 3228 3229 s->memcg_params = kzalloc(size, GFP_KERNEL); 3230 if (!s->memcg_params) 3231 return -ENOMEM; 3232 3233 if (memcg) { 3234 s->memcg_params->memcg = memcg; 3235 s->memcg_params->root_cache = root_cache; 3236 INIT_WORK(&s->memcg_params->destroy, 3237 kmem_cache_destroy_work_func); 3238 } else 3239 s->memcg_params->is_root_cache = true; 3240 3241 return 0; 3242 } 3243 3244 void memcg_release_cache(struct kmem_cache *s) 3245 { 3246 struct kmem_cache *root; 3247 struct mem_cgroup *memcg; 3248 int id; 3249 3250 /* 3251 * This happens, for instance, when a root cache goes away before we 3252 * add any memcg. 3253 */ 3254 if (!s->memcg_params) 3255 return; 3256 3257 if (s->memcg_params->is_root_cache) 3258 goto out; 3259 3260 memcg = s->memcg_params->memcg; 3261 id = memcg_cache_id(memcg); 3262 3263 root = s->memcg_params->root_cache; 3264 root->memcg_params->memcg_caches[id] = NULL; 3265 3266 mutex_lock(&memcg->slab_caches_mutex); 3267 list_del(&s->memcg_params->list); 3268 mutex_unlock(&memcg->slab_caches_mutex); 3269 3270 css_put(&memcg->css); 3271 out: 3272 kfree(s->memcg_params); 3273 } 3274 3275 /* 3276 * During the creation a new cache, we need to disable our accounting mechanism 3277 * altogether. This is true even if we are not creating, but rather just 3278 * enqueing new caches to be created. 3279 * 3280 * This is because that process will trigger allocations; some visible, like 3281 * explicit kmallocs to auxiliary data structures, name strings and internal 3282 * cache structures; some well concealed, like INIT_WORK() that can allocate 3283 * objects during debug. 3284 * 3285 * If any allocation happens during memcg_kmem_get_cache, we will recurse back 3286 * to it. This may not be a bounded recursion: since the first cache creation 3287 * failed to complete (waiting on the allocation), we'll just try to create the 3288 * cache again, failing at the same point. 3289 * 3290 * memcg_kmem_get_cache is prepared to abort after seeing a positive count of 3291 * memcg_kmem_skip_account. So we enclose anything that might allocate memory 3292 * inside the following two functions. 3293 */ 3294 static inline void memcg_stop_kmem_account(void) 3295 { 3296 VM_BUG_ON(!current->mm); 3297 current->memcg_kmem_skip_account++; 3298 } 3299 3300 static inline void memcg_resume_kmem_account(void) 3301 { 3302 VM_BUG_ON(!current->mm); 3303 current->memcg_kmem_skip_account--; 3304 } 3305 3306 static void kmem_cache_destroy_work_func(struct work_struct *w) 3307 { 3308 struct kmem_cache *cachep; 3309 struct memcg_cache_params *p; 3310 3311 p = container_of(w, struct memcg_cache_params, destroy); 3312 3313 cachep = memcg_params_to_cache(p); 3314 3315 /* 3316 * If we get down to 0 after shrink, we could delete right away. 3317 * However, memcg_release_pages() already puts us back in the workqueue 3318 * in that case. 
If we proceed deleting, we'll get a dangling
3319 * reference, and removing the object from the workqueue in that case
3320 * is unnecessary complication. We are not a fast path.
3321 *
3322 * Note that this case is fundamentally different from racing with
3323 * shrink_slab(): if memcg_cgroup_destroy_cache() is called in
3324 * kmem_cache_shrink, not only would we be reinserting a dead cache
3325 * into the queue, but doing so from inside the worker racing to
3326 * destroy it.
3327 *
3328 * So if we aren't down to zero, we'll just schedule a worker and try
3329 * again.
3330 */
3331 if (atomic_read(&cachep->memcg_params->nr_pages) != 0) {
3332 kmem_cache_shrink(cachep);
3333 if (atomic_read(&cachep->memcg_params->nr_pages) == 0)
3334 return;
3335 } else
3336 kmem_cache_destroy(cachep);
3337 }
3338
3339 void mem_cgroup_destroy_cache(struct kmem_cache *cachep)
3340 {
3341 if (!cachep->memcg_params->dead)
3342 return;
3343
3344 /*
3345 * There are many ways in which we can get here.
3346 *
3347 * We can get to a memory-pressure situation while the delayed work is
3348 * still pending to run. The vmscan shrinkers can then release all
3349 * cache memory and get us to destruction. If this is the case, we'll
3350 * be executed twice, which is a bug (the second time will execute over
3351 * bogus data). In this case, cancelling the work should be fine.
3352 *
3353 * But we can also get here from the worker itself, if
3354 * kmem_cache_shrink is enough to shake all the remaining objects and
3355 * get the page count to 0. In this case, we'll deadlock if we try to
3356 * cancel the work (the worker runs with an internal lock held, which
3357 * is the same lock we would hold for cancel_work_sync().)
3358 *
3359 * Since we can't possibly know who got us here, just refrain from
3360 * running if there is already work pending.
3361 */
3362 if (work_pending(&cachep->memcg_params->destroy))
3363 return;
3364 /*
3365 * We have to defer the actual destroying to a workqueue, because
3366 * we might currently be in a context that cannot sleep.
3367 */
3368 schedule_work(&cachep->memcg_params->destroy);
3369 }
3370
3371 /*
3372 * This lock protects updaters, not readers. We want readers to be as fast as
3373 * they can, and they will either see NULL or a valid cache value. Our model
3374 * allows them to see NULL, in which case the root memcg will be selected.
3375 *
3376 * We need this lock because multiple allocations to the same cache may
3377 * span more than one worker. Only one of them can create the cache.
3378 */
3379 static DEFINE_MUTEX(memcg_cache_mutex);
3380
3381 /*
3382 * Called with memcg_cache_mutex held.
3383 */
3384 static struct kmem_cache *kmem_cache_dup(struct mem_cgroup *memcg,
3385 struct kmem_cache *s)
3386 {
3387 struct kmem_cache *new;
3388 static char *tmp_name = NULL;
3389
3390 lockdep_assert_held(&memcg_cache_mutex);
3391
3392 /*
3393 * kmem_cache_create_memcg duplicates the given name, and building the
3394 * cgroup_name() part of that name requires RCU context.
3395 * This static temporary buffer is used to avoid a
3396 * pointless short-lived allocation.
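* For example (hypothetical names), duplicating the root cache
* "kmalloc-2048" for a memcg with cache id 3 whose cgroup is named "foo"
* produces a cache called "kmalloc-2048(3:foo)".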
3397 */ 3398 if (!tmp_name) { 3399 tmp_name = kmalloc(PATH_MAX, GFP_KERNEL); 3400 if (!tmp_name) 3401 return NULL; 3402 } 3403 3404 rcu_read_lock(); 3405 snprintf(tmp_name, PATH_MAX, "%s(%d:%s)", s->name, 3406 memcg_cache_id(memcg), cgroup_name(memcg->css.cgroup)); 3407 rcu_read_unlock(); 3408 3409 new = kmem_cache_create_memcg(memcg, tmp_name, s->object_size, s->align, 3410 (s->flags & ~SLAB_PANIC), s->ctor, s); 3411 3412 if (new) 3413 new->allocflags |= __GFP_KMEMCG; 3414 3415 return new; 3416 } 3417 3418 static struct kmem_cache *memcg_create_kmem_cache(struct mem_cgroup *memcg, 3419 struct kmem_cache *cachep) 3420 { 3421 struct kmem_cache *new_cachep; 3422 int idx; 3423 3424 BUG_ON(!memcg_can_account_kmem(memcg)); 3425 3426 idx = memcg_cache_id(memcg); 3427 3428 mutex_lock(&memcg_cache_mutex); 3429 new_cachep = cache_from_memcg_idx(cachep, idx); 3430 if (new_cachep) { 3431 css_put(&memcg->css); 3432 goto out; 3433 } 3434 3435 new_cachep = kmem_cache_dup(memcg, cachep); 3436 if (new_cachep == NULL) { 3437 new_cachep = cachep; 3438 css_put(&memcg->css); 3439 goto out; 3440 } 3441 3442 atomic_set(&new_cachep->memcg_params->nr_pages , 0); 3443 3444 cachep->memcg_params->memcg_caches[idx] = new_cachep; 3445 /* 3446 * the readers won't lock, make sure everybody sees the updated value, 3447 * so they won't put stuff in the queue again for no reason 3448 */ 3449 wmb(); 3450 out: 3451 mutex_unlock(&memcg_cache_mutex); 3452 return new_cachep; 3453 } 3454 3455 void kmem_cache_destroy_memcg_children(struct kmem_cache *s) 3456 { 3457 struct kmem_cache *c; 3458 int i; 3459 3460 if (!s->memcg_params) 3461 return; 3462 if (!s->memcg_params->is_root_cache) 3463 return; 3464 3465 /* 3466 * If the cache is being destroyed, we trust that there is no one else 3467 * requesting objects from it. Even if there are, the sanity checks in 3468 * kmem_cache_destroy should caught this ill-case. 3469 * 3470 * Still, we don't want anyone else freeing memcg_caches under our 3471 * noses, which can happen if a new memcg comes to life. As usual, 3472 * we'll take the set_limit_mutex to protect ourselves against this. 3473 */ 3474 mutex_lock(&set_limit_mutex); 3475 for_each_memcg_cache_index(i) { 3476 c = cache_from_memcg_idx(s, i); 3477 if (!c) 3478 continue; 3479 3480 /* 3481 * We will now manually delete the caches, so to avoid races 3482 * we need to cancel all pending destruction workers and 3483 * proceed with destruction ourselves. 3484 * 3485 * kmem_cache_destroy() will call kmem_cache_shrink internally, 3486 * and that could spawn the workers again: it is likely that 3487 * the cache still have active pages until this very moment. 3488 * This would lead us back to mem_cgroup_destroy_cache. 3489 * 3490 * But that will not execute at all if the "dead" flag is not 3491 * set, so flip it down to guarantee we are in control. 
3492 */ 3493 c->memcg_params->dead = false; 3494 cancel_work_sync(&c->memcg_params->destroy); 3495 kmem_cache_destroy(c); 3496 } 3497 mutex_unlock(&set_limit_mutex); 3498 } 3499 3500 struct create_work { 3501 struct mem_cgroup *memcg; 3502 struct kmem_cache *cachep; 3503 struct work_struct work; 3504 }; 3505 3506 static void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3507 { 3508 struct kmem_cache *cachep; 3509 struct memcg_cache_params *params; 3510 3511 if (!memcg_kmem_is_active(memcg)) 3512 return; 3513 3514 mutex_lock(&memcg->slab_caches_mutex); 3515 list_for_each_entry(params, &memcg->memcg_slab_caches, list) { 3516 cachep = memcg_params_to_cache(params); 3517 cachep->memcg_params->dead = true; 3518 schedule_work(&cachep->memcg_params->destroy); 3519 } 3520 mutex_unlock(&memcg->slab_caches_mutex); 3521 } 3522 3523 static void memcg_create_cache_work_func(struct work_struct *w) 3524 { 3525 struct create_work *cw; 3526 3527 cw = container_of(w, struct create_work, work); 3528 memcg_create_kmem_cache(cw->memcg, cw->cachep); 3529 kfree(cw); 3530 } 3531 3532 /* 3533 * Enqueue the creation of a per-memcg kmem_cache. 3534 */ 3535 static void __memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3536 struct kmem_cache *cachep) 3537 { 3538 struct create_work *cw; 3539 3540 cw = kmalloc(sizeof(struct create_work), GFP_NOWAIT); 3541 if (cw == NULL) { 3542 css_put(&memcg->css); 3543 return; 3544 } 3545 3546 cw->memcg = memcg; 3547 cw->cachep = cachep; 3548 3549 INIT_WORK(&cw->work, memcg_create_cache_work_func); 3550 schedule_work(&cw->work); 3551 } 3552 3553 static void memcg_create_cache_enqueue(struct mem_cgroup *memcg, 3554 struct kmem_cache *cachep) 3555 { 3556 /* 3557 * We need to stop accounting when we kmalloc, because if the 3558 * corresponding kmalloc cache is not yet created, the first allocation 3559 * in __memcg_create_cache_enqueue will recurse. 3560 * 3561 * However, it is better to enclose the whole function. Depending on 3562 * the debugging options enabled, INIT_WORK(), for instance, can 3563 * trigger an allocation. This too, will make us recurse. Because at 3564 * this point we can't allow ourselves back into memcg_kmem_get_cache, 3565 * the safest choice is to do it like this, wrapping the whole function. 3566 */ 3567 memcg_stop_kmem_account(); 3568 __memcg_create_cache_enqueue(memcg, cachep); 3569 memcg_resume_kmem_account(); 3570 } 3571 /* 3572 * Return the kmem_cache we're supposed to use for a slab allocation. 3573 * We try to use the current memcg's version of the cache. 3574 * 3575 * If the cache does not exist yet, if we are the first user of it, 3576 * we either create it immediately, if possible, or create it asynchronously 3577 * in a workqueue. 3578 * In the latter case, we will let the current allocation go through with 3579 * the original cache. 3580 * 3581 * Can't be called in interrupt context or from kernel threads. 3582 * This function needs to be called with rcu_read_lock() held. 
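* For example, the first allocation from a task in a kmem-limited memcg
* that hits a root cache is served from that root cache while the
* per-memcg copy is created asynchronously; later allocations then find
* the copy via cache_from_memcg_idx().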
3583 */ 3584 struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, 3585 gfp_t gfp) 3586 { 3587 struct mem_cgroup *memcg; 3588 int idx; 3589 3590 VM_BUG_ON(!cachep->memcg_params); 3591 VM_BUG_ON(!cachep->memcg_params->is_root_cache); 3592 3593 if (!current->mm || current->memcg_kmem_skip_account) 3594 return cachep; 3595 3596 rcu_read_lock(); 3597 memcg = mem_cgroup_from_task(rcu_dereference(current->mm->owner)); 3598 3599 if (!memcg_can_account_kmem(memcg)) 3600 goto out; 3601 3602 idx = memcg_cache_id(memcg); 3603 3604 /* 3605 * barrier to mare sure we're always seeing the up to date value. The 3606 * code updating memcg_caches will issue a write barrier to match this. 3607 */ 3608 read_barrier_depends(); 3609 if (likely(cache_from_memcg_idx(cachep, idx))) { 3610 cachep = cache_from_memcg_idx(cachep, idx); 3611 goto out; 3612 } 3613 3614 /* The corresponding put will be done in the workqueue. */ 3615 if (!css_tryget(&memcg->css)) 3616 goto out; 3617 rcu_read_unlock(); 3618 3619 /* 3620 * If we are in a safe context (can wait, and not in interrupt 3621 * context), we could be be predictable and return right away. 3622 * This would guarantee that the allocation being performed 3623 * already belongs in the new cache. 3624 * 3625 * However, there are some clashes that can arrive from locking. 3626 * For instance, because we acquire the slab_mutex while doing 3627 * kmem_cache_dup, this means no further allocation could happen 3628 * with the slab_mutex held. 3629 * 3630 * Also, because cache creation issue get_online_cpus(), this 3631 * creates a lock chain: memcg_slab_mutex -> cpu_hotplug_mutex, 3632 * that ends up reversed during cpu hotplug. (cpuset allocates 3633 * a bunch of GFP_KERNEL memory during cpuup). Due to all that, 3634 * better to defer everything. 3635 */ 3636 memcg_create_cache_enqueue(memcg, cachep); 3637 return cachep; 3638 out: 3639 rcu_read_unlock(); 3640 return cachep; 3641 } 3642 EXPORT_SYMBOL(__memcg_kmem_get_cache); 3643 3644 /* 3645 * We need to verify if the allocation against current->mm->owner's memcg is 3646 * possible for the given order. But the page is not allocated yet, so we'll 3647 * need a further commit step to do the final arrangements. 3648 * 3649 * It is possible for the task to switch cgroups in this mean time, so at 3650 * commit time, we can't rely on task conversion any longer. We'll then use 3651 * the handle argument to return to the caller which cgroup we should commit 3652 * against. We could also return the memcg directly and avoid the pointer 3653 * passing, but a boolean return value gives better semantics considering 3654 * the compiled-out case as well. 3655 * 3656 * Returning true means the allocation is possible. 3657 */ 3658 bool 3659 __memcg_kmem_newpage_charge(gfp_t gfp, struct mem_cgroup **_memcg, int order) 3660 { 3661 struct mem_cgroup *memcg; 3662 int ret; 3663 3664 *_memcg = NULL; 3665 3666 /* 3667 * Disabling accounting is only relevant for some specific memcg 3668 * internal allocations. Therefore we would initially not have such 3669 * check here, since direct calls to the page allocator that are marked 3670 * with GFP_KMEMCG only happen outside memcg core. We are mostly 3671 * concerned with cache allocations, and by having this test at 3672 * memcg_kmem_get_cache, we are already able to relay the allocation to 3673 * the root cache and bypass the memcg cache altogether. 
3674 * 3675 * There is one exception, though: the SLUB allocator does not create 3676 * large order caches, but rather service large kmallocs directly from 3677 * the page allocator. Therefore, the following sequence when backed by 3678 * the SLUB allocator: 3679 * 3680 * memcg_stop_kmem_account(); 3681 * kmalloc(<large_number>) 3682 * memcg_resume_kmem_account(); 3683 * 3684 * would effectively ignore the fact that we should skip accounting, 3685 * since it will drive us directly to this function without passing 3686 * through the cache selector memcg_kmem_get_cache. Such large 3687 * allocations are extremely rare but can happen, for instance, for the 3688 * cache arrays. We bring this test here. 3689 */ 3690 if (!current->mm || current->memcg_kmem_skip_account) 3691 return true; 3692 3693 memcg = try_get_mem_cgroup_from_mm(current->mm); 3694 3695 /* 3696 * very rare case described in mem_cgroup_from_task. Unfortunately there 3697 * isn't much we can do without complicating this too much, and it would 3698 * be gfp-dependent anyway. Just let it go 3699 */ 3700 if (unlikely(!memcg)) 3701 return true; 3702 3703 if (!memcg_can_account_kmem(memcg)) { 3704 css_put(&memcg->css); 3705 return true; 3706 } 3707 3708 ret = memcg_charge_kmem(memcg, gfp, PAGE_SIZE << order); 3709 if (!ret) 3710 *_memcg = memcg; 3711 3712 css_put(&memcg->css); 3713 return (ret == 0); 3714 } 3715 3716 void __memcg_kmem_commit_charge(struct page *page, struct mem_cgroup *memcg, 3717 int order) 3718 { 3719 struct page_cgroup *pc; 3720 3721 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3722 3723 /* The page allocation failed. Revert */ 3724 if (!page) { 3725 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3726 return; 3727 } 3728 3729 pc = lookup_page_cgroup(page); 3730 lock_page_cgroup(pc); 3731 pc->mem_cgroup = memcg; 3732 SetPageCgroupUsed(pc); 3733 unlock_page_cgroup(pc); 3734 } 3735 3736 void __memcg_kmem_uncharge_pages(struct page *page, int order) 3737 { 3738 struct mem_cgroup *memcg = NULL; 3739 struct page_cgroup *pc; 3740 3741 3742 pc = lookup_page_cgroup(page); 3743 /* 3744 * Fast unlocked return. Theoretically might have changed, have to 3745 * check again after locking. 3746 */ 3747 if (!PageCgroupUsed(pc)) 3748 return; 3749 3750 lock_page_cgroup(pc); 3751 if (PageCgroupUsed(pc)) { 3752 memcg = pc->mem_cgroup; 3753 ClearPageCgroupUsed(pc); 3754 } 3755 unlock_page_cgroup(pc); 3756 3757 /* 3758 * We trust that only if there is a memcg associated with the page, it 3759 * is a valid allocation 3760 */ 3761 if (!memcg) 3762 return; 3763 3764 VM_BUG_ON(mem_cgroup_is_root(memcg)); 3765 memcg_uncharge_kmem(memcg, PAGE_SIZE << order); 3766 } 3767 #else 3768 static inline void mem_cgroup_destroy_all_caches(struct mem_cgroup *memcg) 3769 { 3770 } 3771 #endif /* CONFIG_MEMCG_KMEM */ 3772 3773 #ifdef CONFIG_TRANSPARENT_HUGEPAGE 3774 3775 #define PCGF_NOCOPY_AT_SPLIT (1 << PCG_LOCK | 1 << PCG_MIGRATION) 3776 /* 3777 * Because tail pages are not marked as "used", set it. We're under 3778 * zone->lru_lock, 'splitting on pmd' and compound_lock. 3779 * charge/uncharge will be never happen and move_account() is done under 3780 * compound_lock(), so we don't have to take care of races. 
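* For example, splitting a 2MB THP (HPAGE_PMD_NR == 512 with 4K pages)
* copies head_pc->mem_cgroup into the 511 tail page_cgroups and moves the
* whole range out of the RSS_HUGE counter below.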
3781 */ 3782 void mem_cgroup_split_huge_fixup(struct page *head) 3783 { 3784 struct page_cgroup *head_pc = lookup_page_cgroup(head); 3785 struct page_cgroup *pc; 3786 struct mem_cgroup *memcg; 3787 int i; 3788 3789 if (mem_cgroup_disabled()) 3790 return; 3791 3792 memcg = head_pc->mem_cgroup; 3793 for (i = 1; i < HPAGE_PMD_NR; i++) { 3794 pc = head_pc + i; 3795 pc->mem_cgroup = memcg; 3796 smp_wmb();/* see __commit_charge() */ 3797 pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; 3798 } 3799 __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], 3800 HPAGE_PMD_NR); 3801 } 3802 #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ 3803 3804 static inline 3805 void mem_cgroup_move_account_page_stat(struct mem_cgroup *from, 3806 struct mem_cgroup *to, 3807 unsigned int nr_pages, 3808 enum mem_cgroup_stat_index idx) 3809 { 3810 /* Update stat data for mem_cgroup */ 3811 preempt_disable(); 3812 __this_cpu_sub(from->stat->count[idx], nr_pages); 3813 __this_cpu_add(to->stat->count[idx], nr_pages); 3814 preempt_enable(); 3815 } 3816 3817 /** 3818 * mem_cgroup_move_account - move account of the page 3819 * @page: the page 3820 * @nr_pages: number of regular pages (>1 for huge pages) 3821 * @pc: page_cgroup of the page. 3822 * @from: mem_cgroup which the page is moved from. 3823 * @to: mem_cgroup which the page is moved to. @from != @to. 3824 * 3825 * The caller must confirm following. 3826 * - page is not on LRU (isolate_page() is useful.) 3827 * - compound_lock is held when nr_pages > 1 3828 * 3829 * This function doesn't do "charge" to new cgroup and doesn't do "uncharge" 3830 * from old cgroup. 3831 */ 3832 static int mem_cgroup_move_account(struct page *page, 3833 unsigned int nr_pages, 3834 struct page_cgroup *pc, 3835 struct mem_cgroup *from, 3836 struct mem_cgroup *to) 3837 { 3838 unsigned long flags; 3839 int ret; 3840 bool anon = PageAnon(page); 3841 3842 VM_BUG_ON(from == to); 3843 VM_BUG_ON(PageLRU(page)); 3844 /* 3845 * The page is isolated from LRU. So, collapse function 3846 * will not handle this page. But page splitting can happen. 3847 * Do this check under compound_page_lock(). The caller should 3848 * hold it. 3849 */ 3850 ret = -EBUSY; 3851 if (nr_pages > 1 && !PageTransHuge(page)) 3852 goto out; 3853 3854 lock_page_cgroup(pc); 3855 3856 ret = -EINVAL; 3857 if (!PageCgroupUsed(pc) || pc->mem_cgroup != from) 3858 goto unlock; 3859 3860 move_lock_mem_cgroup(from, &flags); 3861 3862 if (!anon && page_mapped(page)) 3863 mem_cgroup_move_account_page_stat(from, to, nr_pages, 3864 MEM_CGROUP_STAT_FILE_MAPPED); 3865 3866 if (PageWriteback(page)) 3867 mem_cgroup_move_account_page_stat(from, to, nr_pages, 3868 MEM_CGROUP_STAT_WRITEBACK); 3869 3870 mem_cgroup_charge_statistics(from, page, anon, -nr_pages); 3871 3872 /* caller should have done css_get */ 3873 pc->mem_cgroup = to; 3874 mem_cgroup_charge_statistics(to, page, anon, nr_pages); 3875 move_unlock_mem_cgroup(from, &flags); 3876 ret = 0; 3877 unlock: 3878 unlock_page_cgroup(pc); 3879 /* 3880 * check events 3881 */ 3882 memcg_check_events(to, page); 3883 memcg_check_events(from, page); 3884 out: 3885 return ret; 3886 } 3887 3888 /** 3889 * mem_cgroup_move_parent - moves page to the parent group 3890 * @page: the page to move 3891 * @pc: page_cgroup of the page 3892 * @child: page's cgroup 3893 * 3894 * move charges to its parent or the root cgroup if the group has no 3895 * parent (aka use_hierarchy==0). 
3896 * Although this might fail (get_page_unless_zero, isolate_lru_page or 3897 * mem_cgroup_move_account fails) the failure is always temporary and 3898 * it signals a race with a page removal/uncharge or migration. In the 3899 * first case the page is on the way out and it will vanish from the LRU 3900 * on the next attempt and the call should be retried later. 3901 * Isolation from the LRU fails only if page has been isolated from 3902 * the LRU since we looked at it and that usually means either global 3903 * reclaim or migration going on. The page will either get back to the 3904 * LRU or vanish. 3905 * Finaly mem_cgroup_move_account fails only if the page got uncharged 3906 * (!PageCgroupUsed) or moved to a different group. The page will 3907 * disappear in the next attempt. 3908 */ 3909 static int mem_cgroup_move_parent(struct page *page, 3910 struct page_cgroup *pc, 3911 struct mem_cgroup *child) 3912 { 3913 struct mem_cgroup *parent; 3914 unsigned int nr_pages; 3915 unsigned long uninitialized_var(flags); 3916 int ret; 3917 3918 VM_BUG_ON(mem_cgroup_is_root(child)); 3919 3920 ret = -EBUSY; 3921 if (!get_page_unless_zero(page)) 3922 goto out; 3923 if (isolate_lru_page(page)) 3924 goto put; 3925 3926 nr_pages = hpage_nr_pages(page); 3927 3928 parent = parent_mem_cgroup(child); 3929 /* 3930 * If no parent, move charges to root cgroup. 3931 */ 3932 if (!parent) 3933 parent = root_mem_cgroup; 3934 3935 if (nr_pages > 1) { 3936 VM_BUG_ON(!PageTransHuge(page)); 3937 flags = compound_lock_irqsave(page); 3938 } 3939 3940 ret = mem_cgroup_move_account(page, nr_pages, 3941 pc, child, parent); 3942 if (!ret) 3943 __mem_cgroup_cancel_local_charge(child, nr_pages); 3944 3945 if (nr_pages > 1) 3946 compound_unlock_irqrestore(page, flags); 3947 putback_lru_page(page); 3948 put: 3949 put_page(page); 3950 out: 3951 return ret; 3952 } 3953 3954 /* 3955 * Charge the memory controller for page usage. 3956 * Return 3957 * 0 if the charge was successful 3958 * < 0 if the cgroup is over its limit 3959 */ 3960 static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, 3961 gfp_t gfp_mask, enum charge_type ctype) 3962 { 3963 struct mem_cgroup *memcg = NULL; 3964 unsigned int nr_pages = 1; 3965 bool oom = true; 3966 int ret; 3967 3968 if (PageTransHuge(page)) { 3969 nr_pages <<= compound_order(page); 3970 VM_BUG_ON(!PageTransHuge(page)); 3971 /* 3972 * Never OOM-kill a process for a huge page. The 3973 * fault handler will fall back to regular pages. 3974 */ 3975 oom = false; 3976 } 3977 3978 ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom); 3979 if (ret == -ENOMEM) 3980 return ret; 3981 __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); 3982 return 0; 3983 } 3984 3985 int mem_cgroup_newpage_charge(struct page *page, 3986 struct mm_struct *mm, gfp_t gfp_mask) 3987 { 3988 if (mem_cgroup_disabled()) 3989 return 0; 3990 VM_BUG_ON(page_mapped(page)); 3991 VM_BUG_ON(page->mapping && !PageAnon(page)); 3992 VM_BUG_ON(!mm); 3993 return mem_cgroup_charge_common(page, mm, gfp_mask, 3994 MEM_CGROUP_CHARGE_TYPE_ANON); 3995 } 3996 3997 /* 3998 * While swap-in, try_charge -> commit or cancel, the page is locked. 3999 * And when try_charge() successfully returns, one refcnt to memcg without 4000 * struct page_cgroup is acquired. 
This refcnt will be consumed by 4001 * "commit()" or removed by "cancel()" 4002 */ 4003 static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, 4004 struct page *page, 4005 gfp_t mask, 4006 struct mem_cgroup **memcgp) 4007 { 4008 struct mem_cgroup *memcg; 4009 struct page_cgroup *pc; 4010 int ret; 4011 4012 pc = lookup_page_cgroup(page); 4013 /* 4014 * Every swap fault against a single page tries to charge the 4015 * page, bail as early as possible. shmem_unuse() encounters 4016 * already charged pages, too. The USED bit is protected by 4017 * the page lock, which serializes swap cache removal, which 4018 * in turn serializes uncharging. 4019 */ 4020 if (PageCgroupUsed(pc)) 4021 return 0; 4022 if (!do_swap_account) 4023 goto charge_cur_mm; 4024 memcg = try_get_mem_cgroup_from_page(page); 4025 if (!memcg) 4026 goto charge_cur_mm; 4027 *memcgp = memcg; 4028 ret = __mem_cgroup_try_charge(NULL, mask, 1, memcgp, true); 4029 css_put(&memcg->css); 4030 if (ret == -EINTR) 4031 ret = 0; 4032 return ret; 4033 charge_cur_mm: 4034 ret = __mem_cgroup_try_charge(mm, mask, 1, memcgp, true); 4035 if (ret == -EINTR) 4036 ret = 0; 4037 return ret; 4038 } 4039 4040 int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, 4041 gfp_t gfp_mask, struct mem_cgroup **memcgp) 4042 { 4043 *memcgp = NULL; 4044 if (mem_cgroup_disabled()) 4045 return 0; 4046 /* 4047 * A racing thread's fault, or swapoff, may have already 4048 * updated the pte, and even removed page from swap cache: in 4049 * those cases unuse_pte()'s pte_same() test will fail; but 4050 * there's also a KSM case which does need to charge the page. 4051 */ 4052 if (!PageSwapCache(page)) { 4053 int ret; 4054 4055 ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, memcgp, true); 4056 if (ret == -EINTR) 4057 ret = 0; 4058 return ret; 4059 } 4060 return __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, memcgp); 4061 } 4062 4063 void mem_cgroup_cancel_charge_swapin(struct mem_cgroup *memcg) 4064 { 4065 if (mem_cgroup_disabled()) 4066 return; 4067 if (!memcg) 4068 return; 4069 __mem_cgroup_cancel_charge(memcg, 1); 4070 } 4071 4072 static void 4073 __mem_cgroup_commit_charge_swapin(struct page *page, struct mem_cgroup *memcg, 4074 enum charge_type ctype) 4075 { 4076 if (mem_cgroup_disabled()) 4077 return; 4078 if (!memcg) 4079 return; 4080 4081 __mem_cgroup_commit_charge(memcg, page, 1, ctype, true); 4082 /* 4083 * Now swap is on-memory. This means this page may be 4084 * counted both as mem and swap....double count. 4085 * Fix it by uncharging from memsw. Basically, this SwapCache is stable 4086 * under lock_page(). But in do_swap_page()::memory.c, reuse_swap_page() 4087 * may call delete_from_swap_cache() before reach here. 
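 * That is why PageSwapCache() is re-checked below before touching memsw:
 * if the page has already left the swap cache, there is no double count
 * left to fix up.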
4088 */ 4089 if (do_swap_account && PageSwapCache(page)) { 4090 swp_entry_t ent = {.val = page_private(page)}; 4091 mem_cgroup_uncharge_swap(ent); 4092 } 4093 } 4094 4095 void mem_cgroup_commit_charge_swapin(struct page *page, 4096 struct mem_cgroup *memcg) 4097 { 4098 __mem_cgroup_commit_charge_swapin(page, memcg, 4099 MEM_CGROUP_CHARGE_TYPE_ANON); 4100 } 4101 4102 int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, 4103 gfp_t gfp_mask) 4104 { 4105 struct mem_cgroup *memcg = NULL; 4106 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4107 int ret; 4108 4109 if (mem_cgroup_disabled()) 4110 return 0; 4111 if (PageCompound(page)) 4112 return 0; 4113 4114 if (!PageSwapCache(page)) 4115 ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); 4116 else { /* page is swapcache/shmem */ 4117 ret = __mem_cgroup_try_charge_swapin(mm, page, 4118 gfp_mask, &memcg); 4119 if (!ret) 4120 __mem_cgroup_commit_charge_swapin(page, memcg, type); 4121 } 4122 return ret; 4123 } 4124 4125 static void mem_cgroup_do_uncharge(struct mem_cgroup *memcg, 4126 unsigned int nr_pages, 4127 const enum charge_type ctype) 4128 { 4129 struct memcg_batch_info *batch = NULL; 4130 bool uncharge_memsw = true; 4131 4132 /* If swapout, usage of swap doesn't decrease */ 4133 if (!do_swap_account || ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) 4134 uncharge_memsw = false; 4135 4136 batch = &current->memcg_batch; 4137 /* 4138 * Usually we do css_get() when we remember the memcg pointer. 4139 * But in this case, we keep res->usage until the end of a series of 4140 * uncharges, so it is OK to ignore the memcg's refcount. 4141 */ 4142 if (!batch->memcg) 4143 batch->memcg = memcg; 4144 /* 4145 * do_batch > 0 when unmapping pages or inode invalidate/truncate. 4146 * In those cases, all pages freed continuously can be expected to be in 4147 * the same cgroup and we have a chance to coalesce uncharges. 4148 * But we uncharge one by one if the task is being OOM-killed (TIF_MEMDIE) 4149 * because we want to uncharge as soon as possible. 4150 */ 4151 4152 if (!batch->do_batch || test_thread_flag(TIF_MEMDIE)) 4153 goto direct_uncharge; 4154 4155 if (nr_pages > 1) 4156 goto direct_uncharge; 4157 4158 /* 4159 * In the typical case, batch->memcg == memcg. This means we can 4160 * merge a series of uncharges into a single res_counter uncharge. 4161 * If not, we uncharge the res_counter one by one.
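 *
 * For orientation, a minimal sketch of how this batching is meant to be
 * used by callers (illustrative pseudo-code only; the real callers are
 * the truncate/invalidate and unmap paths mentioned above):
 *
 *	mem_cgroup_uncharge_start();
 *	for each page being released:
 *		mem_cgroup_uncharge_cache_page(page);	(or mem_cgroup_uncharge_page() for anon)
 *	mem_cgroup_uncharge_end();
 *
 * Single-page uncharges issued between start() and end() against the same
 * memcg are accumulated in current->memcg_batch and folded into one
 * res_counter_uncharge() in mem_cgroup_uncharge_end(); huge pages and
 * OOM-killed tasks take the direct_uncharge path above instead.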
4162 */ 4163 if (batch->memcg != memcg) 4164 goto direct_uncharge; 4165 /* remember freed charge and uncharge it later */ 4166 batch->nr_pages++; 4167 if (uncharge_memsw) 4168 batch->memsw_nr_pages++; 4169 return; 4170 direct_uncharge: 4171 res_counter_uncharge(&memcg->res, nr_pages * PAGE_SIZE); 4172 if (uncharge_memsw) 4173 res_counter_uncharge(&memcg->memsw, nr_pages * PAGE_SIZE); 4174 if (unlikely(batch->memcg != memcg)) 4175 memcg_oom_recover(memcg); 4176 } 4177 4178 /* 4179 * uncharge if !page_mapped(page) 4180 */ 4181 static struct mem_cgroup * 4182 __mem_cgroup_uncharge_common(struct page *page, enum charge_type ctype, 4183 bool end_migration) 4184 { 4185 struct mem_cgroup *memcg = NULL; 4186 unsigned int nr_pages = 1; 4187 struct page_cgroup *pc; 4188 bool anon; 4189 4190 if (mem_cgroup_disabled()) 4191 return NULL; 4192 4193 if (PageTransHuge(page)) { 4194 nr_pages <<= compound_order(page); 4195 VM_BUG_ON(!PageTransHuge(page)); 4196 } 4197 /* 4198 * Check if our page_cgroup is valid 4199 */ 4200 pc = lookup_page_cgroup(page); 4201 if (unlikely(!PageCgroupUsed(pc))) 4202 return NULL; 4203 4204 lock_page_cgroup(pc); 4205 4206 memcg = pc->mem_cgroup; 4207 4208 if (!PageCgroupUsed(pc)) 4209 goto unlock_out; 4210 4211 anon = PageAnon(page); 4212 4213 switch (ctype) { 4214 case MEM_CGROUP_CHARGE_TYPE_ANON: 4215 /* 4216 * Generally PageAnon tells if it's the anon statistics to be 4217 * updated; but sometimes e.g. mem_cgroup_uncharge_page() is 4218 * used before page reached the stage of being marked PageAnon. 4219 */ 4220 anon = true; 4221 /* fallthrough */ 4222 case MEM_CGROUP_CHARGE_TYPE_DROP: 4223 /* See mem_cgroup_prepare_migration() */ 4224 if (page_mapped(page)) 4225 goto unlock_out; 4226 /* 4227 * Pages under migration may not be uncharged. But 4228 * end_migration() /must/ be the one uncharging the 4229 * unused post-migration page and so it has to call 4230 * here with the migration bit still set. See the 4231 * res_counter handling below. 4232 */ 4233 if (!end_migration && PageCgroupMigration(pc)) 4234 goto unlock_out; 4235 break; 4236 case MEM_CGROUP_CHARGE_TYPE_SWAPOUT: 4237 if (!PageAnon(page)) { /* Shared memory */ 4238 if (page->mapping && !page_is_file_cache(page)) 4239 goto unlock_out; 4240 } else if (page_mapped(page)) /* Anon */ 4241 goto unlock_out; 4242 break; 4243 default: 4244 break; 4245 } 4246 4247 mem_cgroup_charge_statistics(memcg, page, anon, -nr_pages); 4248 4249 ClearPageCgroupUsed(pc); 4250 /* 4251 * pc->mem_cgroup is not cleared here. It will be accessed when it's 4252 * freed from LRU. This is safe because uncharged page is expected not 4253 * to be reused (freed soon). Exception is SwapCache, it's handled by 4254 * special functions. 4255 */ 4256 4257 unlock_page_cgroup(pc); 4258 /* 4259 * even after unlock, we have memcg->res.usage here and this memcg 4260 * will never be freed, so it's safe to call css_get(). 4261 */ 4262 memcg_check_events(memcg, page); 4263 if (do_swap_account && ctype == MEM_CGROUP_CHARGE_TYPE_SWAPOUT) { 4264 mem_cgroup_swap_statistics(memcg, true); 4265 css_get(&memcg->css); 4266 } 4267 /* 4268 * Migration does not charge the res_counter for the 4269 * replacement page, so leave it alone when phasing out the 4270 * page that is unused after the migration. 
4271 */ 4272 if (!end_migration && !mem_cgroup_is_root(memcg)) 4273 mem_cgroup_do_uncharge(memcg, nr_pages, ctype); 4274 4275 return memcg; 4276 4277 unlock_out: 4278 unlock_page_cgroup(pc); 4279 return NULL; 4280 } 4281 4282 void mem_cgroup_uncharge_page(struct page *page) 4283 { 4284 /* early check. */ 4285 if (page_mapped(page)) 4286 return; 4287 VM_BUG_ON(page->mapping && !PageAnon(page)); 4288 /* 4289 * If the page is in swap cache, uncharge should be deferred 4290 * to the swap path, which also properly accounts swap usage 4291 * and handles memcg lifetime. 4292 * 4293 * Note that this check is not stable and reclaim may add the 4294 * page to swap cache at any time after this. However, if the 4295 * page is not in swap cache by the time page->mapcount hits 4296 * 0, there won't be any page table references to the swap 4297 * slot, and reclaim will free it and not actually write the 4298 * page to disk. 4299 */ 4300 if (PageSwapCache(page)) 4301 return; 4302 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_ANON, false); 4303 } 4304 4305 void mem_cgroup_uncharge_cache_page(struct page *page) 4306 { 4307 VM_BUG_ON(page_mapped(page)); 4308 VM_BUG_ON(page->mapping); 4309 __mem_cgroup_uncharge_common(page, MEM_CGROUP_CHARGE_TYPE_CACHE, false); 4310 } 4311 4312 /* 4313 * mem_cgroup_uncharge_start()/end() are called from the unmap_page_range, 4314 * invalidate and truncate paths. In those cases, pages are freed continuously 4315 * and we can expect them to belong to the same memcg. Each of those callers 4316 * itself limits the number of pages freed at once, so uncharge_start/end() 4317 * pair up properly. The pair may also be nested (called more than once) in one context. 4318 */ 4319 4320 void mem_cgroup_uncharge_start(void) 4321 { 4322 current->memcg_batch.do_batch++; 4323 /* Nesting is allowed. */ 4324 if (current->memcg_batch.do_batch == 1) { 4325 current->memcg_batch.memcg = NULL; 4326 current->memcg_batch.nr_pages = 0; 4327 current->memcg_batch.memsw_nr_pages = 0; 4328 } 4329 } 4330 4331 void mem_cgroup_uncharge_end(void) 4332 { 4333 struct memcg_batch_info *batch = &current->memcg_batch; 4334 4335 if (!batch->do_batch) 4336 return; 4337 4338 batch->do_batch--; 4339 if (batch->do_batch) /* If stacked, do nothing. */ 4340 return; 4341 4342 if (!batch->memcg) 4343 return; 4344 /* 4345 * This "batch->memcg" is valid without any css_get/put etc... 4346 * because we hide charges behind us. 4347 */ 4348 if (batch->nr_pages) 4349 res_counter_uncharge(&batch->memcg->res, 4350 batch->nr_pages * PAGE_SIZE); 4351 if (batch->memsw_nr_pages) 4352 res_counter_uncharge(&batch->memcg->memsw, 4353 batch->memsw_nr_pages * PAGE_SIZE); 4354 memcg_oom_recover(batch->memcg); 4355 /* forget this pointer (for sanity check) */ 4356 batch->memcg = NULL; 4357 } 4358 4359 #ifdef CONFIG_SWAP 4360 /* 4361 * Called after __delete_from_swap_cache(); drops the "page" account. 4362 * The memcg information is recorded in the swap_cgroup of "ent". 4363 */ 4364 void 4365 mem_cgroup_uncharge_swapcache(struct page *page, swp_entry_t ent, bool swapout) 4366 { 4367 struct mem_cgroup *memcg; 4368 int ctype = MEM_CGROUP_CHARGE_TYPE_SWAPOUT; 4369 4370 if (!swapout) /* this was a swap cache but the swap is unused! */ 4371 ctype = MEM_CGROUP_CHARGE_TYPE_DROP; 4372 4373 memcg = __mem_cgroup_uncharge_common(page, ctype, false); 4374 4375 /* 4376 * Record the memcg information; if swapout && memcg != NULL, 4377 * css_get() was called in uncharge().
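 * The matching css_put() happens later, in mem_cgroup_uncharge_swap(),
 * once the swap entry itself is freed.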
4378 */ 4379 if (do_swap_account && swapout && memcg) 4380 swap_cgroup_record(ent, mem_cgroup_id(memcg)); 4381 } 4382 #endif 4383 4384 #ifdef CONFIG_MEMCG_SWAP 4385 /* 4386 * called from swap_entry_free(). remove record in swap_cgroup and 4387 * uncharge "memsw" account. 4388 */ 4389 void mem_cgroup_uncharge_swap(swp_entry_t ent) 4390 { 4391 struct mem_cgroup *memcg; 4392 unsigned short id; 4393 4394 if (!do_swap_account) 4395 return; 4396 4397 id = swap_cgroup_record(ent, 0); 4398 rcu_read_lock(); 4399 memcg = mem_cgroup_lookup(id); 4400 if (memcg) { 4401 /* 4402 * We uncharge this because swap is freed. 4403 * This memcg can be obsolete one. We avoid calling css_tryget 4404 */ 4405 if (!mem_cgroup_is_root(memcg)) 4406 res_counter_uncharge(&memcg->memsw, PAGE_SIZE); 4407 mem_cgroup_swap_statistics(memcg, false); 4408 css_put(&memcg->css); 4409 } 4410 rcu_read_unlock(); 4411 } 4412 4413 /** 4414 * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. 4415 * @entry: swap entry to be moved 4416 * @from: mem_cgroup which the entry is moved from 4417 * @to: mem_cgroup which the entry is moved to 4418 * 4419 * It succeeds only when the swap_cgroup's record for this entry is the same 4420 * as the mem_cgroup's id of @from. 4421 * 4422 * Returns 0 on success, -EINVAL on failure. 4423 * 4424 * The caller must have charged to @to, IOW, called res_counter_charge() about 4425 * both res and memsw, and called css_get(). 4426 */ 4427 static int mem_cgroup_move_swap_account(swp_entry_t entry, 4428 struct mem_cgroup *from, struct mem_cgroup *to) 4429 { 4430 unsigned short old_id, new_id; 4431 4432 old_id = mem_cgroup_id(from); 4433 new_id = mem_cgroup_id(to); 4434 4435 if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { 4436 mem_cgroup_swap_statistics(from, false); 4437 mem_cgroup_swap_statistics(to, true); 4438 /* 4439 * This function is only called from task migration context now. 4440 * It postpones res_counter and refcount handling till the end 4441 * of task migration(mem_cgroup_clear_mc()) for performance 4442 * improvement. But we cannot postpone css_get(to) because if 4443 * the process that has been moved to @to does swap-in, the 4444 * refcount of @to might be decreased to 0. 4445 * 4446 * We are in attach() phase, so the cgroup is guaranteed to be 4447 * alive, so we can just call css_get(). 4448 */ 4449 css_get(&to->css); 4450 return 0; 4451 } 4452 return -EINVAL; 4453 } 4454 #else 4455 static inline int mem_cgroup_move_swap_account(swp_entry_t entry, 4456 struct mem_cgroup *from, struct mem_cgroup *to) 4457 { 4458 return -EINVAL; 4459 } 4460 #endif 4461 4462 /* 4463 * Before starting migration, account PAGE_SIZE to mem_cgroup that the old 4464 * page belongs to. 4465 */ 4466 void mem_cgroup_prepare_migration(struct page *page, struct page *newpage, 4467 struct mem_cgroup **memcgp) 4468 { 4469 struct mem_cgroup *memcg = NULL; 4470 unsigned int nr_pages = 1; 4471 struct page_cgroup *pc; 4472 enum charge_type ctype; 4473 4474 *memcgp = NULL; 4475 4476 if (mem_cgroup_disabled()) 4477 return; 4478 4479 if (PageTransHuge(page)) 4480 nr_pages <<= compound_order(page); 4481 4482 pc = lookup_page_cgroup(page); 4483 lock_page_cgroup(pc); 4484 if (PageCgroupUsed(pc)) { 4485 memcg = pc->mem_cgroup; 4486 css_get(&memcg->css); 4487 /* 4488 * At migrating an anonymous page, its mapcount goes down 4489 * to 0 and uncharge() will be called. But, even if it's fully 4490 * unmapped, migration may fail and this page has to be 4491 * charged again. 
We set the MIGRATION flag here and delay uncharge 4492 * until end_migration() is called. 4493 * 4494 * Corner case thinking: 4495 * A) 4496 * The old page was mapped as Anon and is unmapped and freed 4497 * while migration is ongoing. 4498 * If unmap finds the old page, its uncharge() will be delayed 4499 * until end_migration(). If unmap finds the new page, it is 4500 * uncharged when its mapcount drops from 1 to 0. If the unmap 4501 * code finds a swap migration entry, the new page will not be 4502 * mapped and end_migration() will find it (mapcount == 0). 4503 * 4504 * B) 4505 * The old page was mapped but migration fails, so the kernel 4506 * remaps it. A charge for it is kept by the MIGRATION flag even 4507 * if its mapcount goes down to 0, so the remap succeeds 4508 * without charging it again. 4509 * 4510 * C) 4511 * The "old" page is under lock_page() until the end of 4512 * migration, so the old page itself will not be swapped out. 4513 * If the new page is swapped out before end_migration(), our 4514 * hook into the usual swap-out path will catch the event. 4515 */ 4516 if (PageAnon(page)) 4517 SetPageCgroupMigration(pc); 4518 } 4519 unlock_page_cgroup(pc); 4520 /* 4521 * If the page is not charged at this point, 4522 * we return here. 4523 */ 4524 if (!memcg) 4525 return; 4526 4527 *memcgp = memcg; 4528 /* 4529 * We charge the new page before it is used/mapped. So, even if unlock_page() 4530 * is called before end_migration(), we can catch all events on this new 4531 * page. If the new page is migrated but never remapped, its 4532 * mapcount will finally be 0 and we uncharge it in end_migration(). 4533 */ 4534 if (PageAnon(page)) 4535 ctype = MEM_CGROUP_CHARGE_TYPE_ANON; 4536 else 4537 ctype = MEM_CGROUP_CHARGE_TYPE_CACHE; 4538 /* 4539 * The page is committed to the memcg, but it's not actually 4540 * charged to the res_counter since we plan on replacing the 4541 * old one and only one page is going to be left afterwards. 4542 */ 4543 __mem_cgroup_commit_charge(memcg, newpage, nr_pages, ctype, false); 4544 } 4545 4546 /* remove the redundant charge if migration failed */ 4547 void mem_cgroup_end_migration(struct mem_cgroup *memcg, 4548 struct page *oldpage, struct page *newpage, bool migration_ok) 4549 { 4550 struct page *used, *unused; 4551 struct page_cgroup *pc; 4552 bool anon; 4553 4554 if (!memcg) 4555 return; 4556 4557 if (!migration_ok) { 4558 used = oldpage; 4559 unused = newpage; 4560 } else { 4561 used = newpage; 4562 unused = oldpage; 4563 } 4564 anon = PageAnon(used); 4565 __mem_cgroup_uncharge_common(unused, 4566 anon ? MEM_CGROUP_CHARGE_TYPE_ANON 4567 : MEM_CGROUP_CHARGE_TYPE_CACHE, 4568 true); 4569 css_put(&memcg->css); 4570 /* 4571 * We disallowed uncharge of pages under migration because the mapcount 4572 * of the page goes down to zero, temporarily. 4573 * Clear the flag and check whether the page should still be charged. 4574 */ 4575 pc = lookup_page_cgroup(oldpage); 4576 lock_page_cgroup(pc); 4577 ClearPageCgroupMigration(pc); 4578 unlock_page_cgroup(pc); 4579 4580 /* 4581 * If the page is file cache, the radix-tree replacement is atomic 4582 * and we can skip this check. When it was an Anon page, its mapcount 4583 * went down to 0, but because we added the MIGRATION flag it is not 4584 * uncharged yet. There are several cases, but the page->mapcount check 4585 * and the USED bit check in mem_cgroup_uncharge_page() are a sufficient 4586 * check.
(see prepare_charge() also) 4587 */ 4588 if (anon) 4589 mem_cgroup_uncharge_page(used); 4590 } 4591 4592 /* 4593 * At replace page cache, newpage is not under any memcg but it's on 4594 * LRU. So, this function doesn't touch res_counter but handles LRU 4595 * in correct way. Both pages are locked so we cannot race with uncharge. 4596 */ 4597 void mem_cgroup_replace_page_cache(struct page *oldpage, 4598 struct page *newpage) 4599 { 4600 struct mem_cgroup *memcg = NULL; 4601 struct page_cgroup *pc; 4602 enum charge_type type = MEM_CGROUP_CHARGE_TYPE_CACHE; 4603 4604 if (mem_cgroup_disabled()) 4605 return; 4606 4607 pc = lookup_page_cgroup(oldpage); 4608 /* fix accounting on old pages */ 4609 lock_page_cgroup(pc); 4610 if (PageCgroupUsed(pc)) { 4611 memcg = pc->mem_cgroup; 4612 mem_cgroup_charge_statistics(memcg, oldpage, false, -1); 4613 ClearPageCgroupUsed(pc); 4614 } 4615 unlock_page_cgroup(pc); 4616 4617 /* 4618 * When called from shmem_replace_page(), in some cases the 4619 * oldpage has already been charged, and in some cases not. 4620 */ 4621 if (!memcg) 4622 return; 4623 /* 4624 * Even if newpage->mapping was NULL before starting replacement, 4625 * the newpage may be on LRU(or pagevec for LRU) already. We lock 4626 * LRU while we overwrite pc->mem_cgroup. 4627 */ 4628 __mem_cgroup_commit_charge(memcg, newpage, 1, type, true); 4629 } 4630 4631 #ifdef CONFIG_DEBUG_VM 4632 static struct page_cgroup *lookup_page_cgroup_used(struct page *page) 4633 { 4634 struct page_cgroup *pc; 4635 4636 pc = lookup_page_cgroup(page); 4637 /* 4638 * Can be NULL while feeding pages into the page allocator for 4639 * the first time, i.e. during boot or memory hotplug; 4640 * or when mem_cgroup_disabled(). 4641 */ 4642 if (likely(pc) && PageCgroupUsed(pc)) 4643 return pc; 4644 return NULL; 4645 } 4646 4647 bool mem_cgroup_bad_page_check(struct page *page) 4648 { 4649 if (mem_cgroup_disabled()) 4650 return false; 4651 4652 return lookup_page_cgroup_used(page) != NULL; 4653 } 4654 4655 void mem_cgroup_print_bad_page(struct page *page) 4656 { 4657 struct page_cgroup *pc; 4658 4659 pc = lookup_page_cgroup_used(page); 4660 if (pc) { 4661 pr_alert("pc:%p pc->flags:%lx pc->mem_cgroup:%p\n", 4662 pc, pc->flags, pc->mem_cgroup); 4663 } 4664 } 4665 #endif 4666 4667 static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, 4668 unsigned long long val) 4669 { 4670 int retry_count; 4671 u64 memswlimit, memlimit; 4672 int ret = 0; 4673 int children = mem_cgroup_count_children(memcg); 4674 u64 curusage, oldusage; 4675 int enlarge; 4676 4677 /* 4678 * For keeping hierarchical_reclaim simple, how long we should retry 4679 * is depends on callers. We set our retry-count to be function 4680 * of # of children which we should visit in this loop. 4681 */ 4682 retry_count = MEM_CGROUP_RECLAIM_RETRIES * children; 4683 4684 oldusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4685 4686 enlarge = 0; 4687 while (retry_count) { 4688 if (signal_pending(current)) { 4689 ret = -EINTR; 4690 break; 4691 } 4692 /* 4693 * Rather than hide all in some function, I do this in 4694 * open coded manner. You see what this really does. 4695 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 
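 *
 * For orientation: this resize path is what a write to the memcg v1 limit
 * file ends up in (illustrative; the path assumes the usual v1 mount
 * point, with mem_cgroup_write() doing the parsing):
 *
 *	echo 512M > /sys/fs/cgroup/memory/<group>/memory.limit_in_bytes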
4696 */ 4697 mutex_lock(&set_limit_mutex); 4698 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4699 if (memswlimit < val) { 4700 ret = -EINVAL; 4701 mutex_unlock(&set_limit_mutex); 4702 break; 4703 } 4704 4705 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4706 if (memlimit < val) 4707 enlarge = 1; 4708 4709 ret = res_counter_set_limit(&memcg->res, val); 4710 if (!ret) { 4711 if (memswlimit == val) 4712 memcg->memsw_is_minimum = true; 4713 else 4714 memcg->memsw_is_minimum = false; 4715 } 4716 mutex_unlock(&set_limit_mutex); 4717 4718 if (!ret) 4719 break; 4720 4721 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4722 MEM_CGROUP_RECLAIM_SHRINK); 4723 curusage = res_counter_read_u64(&memcg->res, RES_USAGE); 4724 /* Usage is reduced ? */ 4725 if (curusage >= oldusage) 4726 retry_count--; 4727 else 4728 oldusage = curusage; 4729 } 4730 if (!ret && enlarge) 4731 memcg_oom_recover(memcg); 4732 4733 return ret; 4734 } 4735 4736 static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, 4737 unsigned long long val) 4738 { 4739 int retry_count; 4740 u64 memlimit, memswlimit, oldusage, curusage; 4741 int children = mem_cgroup_count_children(memcg); 4742 int ret = -EBUSY; 4743 int enlarge = 0; 4744 4745 /* see mem_cgroup_resize_res_limit */ 4746 retry_count = children * MEM_CGROUP_RECLAIM_RETRIES; 4747 oldusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4748 while (retry_count) { 4749 if (signal_pending(current)) { 4750 ret = -EINTR; 4751 break; 4752 } 4753 /* 4754 * Rather than hide all in some function, I do this in 4755 * open coded manner. You see what this really does. 4756 * We have to guarantee memcg->res.limit <= memcg->memsw.limit. 4757 */ 4758 mutex_lock(&set_limit_mutex); 4759 memlimit = res_counter_read_u64(&memcg->res, RES_LIMIT); 4760 if (memlimit > val) { 4761 ret = -EINVAL; 4762 mutex_unlock(&set_limit_mutex); 4763 break; 4764 } 4765 memswlimit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 4766 if (memswlimit < val) 4767 enlarge = 1; 4768 ret = res_counter_set_limit(&memcg->memsw, val); 4769 if (!ret) { 4770 if (memlimit == val) 4771 memcg->memsw_is_minimum = true; 4772 else 4773 memcg->memsw_is_minimum = false; 4774 } 4775 mutex_unlock(&set_limit_mutex); 4776 4777 if (!ret) 4778 break; 4779 4780 mem_cgroup_reclaim(memcg, GFP_KERNEL, 4781 MEM_CGROUP_RECLAIM_NOSWAP | 4782 MEM_CGROUP_RECLAIM_SHRINK); 4783 curusage = res_counter_read_u64(&memcg->memsw, RES_USAGE); 4784 /* Usage is reduced ? 
*/ 4785 if (curusage >= oldusage) 4786 retry_count--; 4787 else 4788 oldusage = curusage; 4789 } 4790 if (!ret && enlarge) 4791 memcg_oom_recover(memcg); 4792 return ret; 4793 } 4794 4795 unsigned long mem_cgroup_soft_limit_reclaim(struct zone *zone, int order, 4796 gfp_t gfp_mask, 4797 unsigned long *total_scanned) 4798 { 4799 unsigned long nr_reclaimed = 0; 4800 struct mem_cgroup_per_zone *mz, *next_mz = NULL; 4801 unsigned long reclaimed; 4802 int loop = 0; 4803 struct mem_cgroup_tree_per_zone *mctz; 4804 unsigned long long excess; 4805 unsigned long nr_scanned; 4806 4807 if (order > 0) 4808 return 0; 4809 4810 mctz = soft_limit_tree_node_zone(zone_to_nid(zone), zone_idx(zone)); 4811 /* 4812 * This loop can run for a while, especially if mem_cgroups continuously 4813 * keep exceeding their soft limit and putting the system under 4814 * pressure. 4815 */ 4816 do { 4817 if (next_mz) 4818 mz = next_mz; 4819 else 4820 mz = mem_cgroup_largest_soft_limit_node(mctz); 4821 if (!mz) 4822 break; 4823 4824 nr_scanned = 0; 4825 reclaimed = mem_cgroup_soft_reclaim(mz->memcg, zone, 4826 gfp_mask, &nr_scanned); 4827 nr_reclaimed += reclaimed; 4828 *total_scanned += nr_scanned; 4829 spin_lock(&mctz->lock); 4830 4831 /* 4832 * If we failed to reclaim anything from this memory cgroup 4833 * it is time to move on to the next cgroup. 4834 */ 4835 next_mz = NULL; 4836 if (!reclaimed) { 4837 do { 4838 /* 4839 * Loop until we find yet another one. 4840 * 4841 * By the time we get the soft_limit lock 4842 * again, someone might have added the 4843 * group back onto the RB tree. Iterate to 4844 * make sure we get a different memcg. 4845 * mem_cgroup_largest_soft_limit_node returns 4846 * NULL if no other cgroup is present on 4847 * the tree. 4848 */ 4849 next_mz = 4850 __mem_cgroup_largest_soft_limit_node(mctz); 4851 if (next_mz == mz) 4852 css_put(&next_mz->memcg->css); 4853 else /* next_mz == NULL or other memcg */ 4854 break; 4855 } while (1); 4856 } 4857 __mem_cgroup_remove_exceeded(mz->memcg, mz, mctz); 4858 excess = res_counter_soft_limit_excess(&mz->memcg->res); 4859 /* 4860 * One school of thought says that we should not add 4861 * back the node to the tree if reclaim returns 0. 4862 * But our reclaim could return 0, simply because, due 4863 * to priority, we are exposing a smaller subset of 4864 * memory to reclaim from. Consider this as a longer 4865 * term TODO. 4866 */ 4867 /* If excess == 0, no tree ops */ 4868 __mem_cgroup_insert_exceeded(mz->memcg, mz, mctz, excess); 4869 spin_unlock(&mctz->lock); 4870 css_put(&mz->memcg->css); 4871 loop++; 4872 /* 4873 * Could not reclaim anything and there are no more 4874 * mem cgroups to try or we seem to be looping without 4875 * reclaiming anything. 4876 */ 4877 if (!nr_reclaimed && 4878 (next_mz == NULL || 4879 loop > MEM_CGROUP_MAX_SOFT_LIMIT_RECLAIM_LOOPS)) 4880 break; 4881 } while (!nr_reclaimed); 4882 if (next_mz) 4883 css_put(&next_mz->memcg->css); 4884 return nr_reclaimed; 4885 } 4886 4887 /** 4888 * mem_cgroup_force_empty_list - clears LRU of a group 4889 * @memcg: group to clear 4890 * @node: NUMA node 4891 * @zid: zone id 4892 * @lru: lru to clear 4893 * 4894 * Traverse a specified page_cgroup list and try to drop them all. This doesn't 4895 * reclaim the pages themselves - pages are moved to the parent (or root) 4896 * group.
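 * Pages whose move fails (lock contention or an obsolete page_cgroup) are
 * remembered in "busy" and rotated so the scan can make progress; the
 * loop keeps going until the list is observed empty.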
4897 */ 4898 static void mem_cgroup_force_empty_list(struct mem_cgroup *memcg, 4899 int node, int zid, enum lru_list lru) 4900 { 4901 struct lruvec *lruvec; 4902 unsigned long flags; 4903 struct list_head *list; 4904 struct page *busy; 4905 struct zone *zone; 4906 4907 zone = &NODE_DATA(node)->node_zones[zid]; 4908 lruvec = mem_cgroup_zone_lruvec(zone, memcg); 4909 list = &lruvec->lists[lru]; 4910 4911 busy = NULL; 4912 do { 4913 struct page_cgroup *pc; 4914 struct page *page; 4915 4916 spin_lock_irqsave(&zone->lru_lock, flags); 4917 if (list_empty(list)) { 4918 spin_unlock_irqrestore(&zone->lru_lock, flags); 4919 break; 4920 } 4921 page = list_entry(list->prev, struct page, lru); 4922 if (busy == page) { 4923 list_move(&page->lru, list); 4924 busy = NULL; 4925 spin_unlock_irqrestore(&zone->lru_lock, flags); 4926 continue; 4927 } 4928 spin_unlock_irqrestore(&zone->lru_lock, flags); 4929 4930 pc = lookup_page_cgroup(page); 4931 4932 if (mem_cgroup_move_parent(page, pc, memcg)) { 4933 /* found lock contention or "pc" is obsolete. */ 4934 busy = page; 4935 cond_resched(); 4936 } else 4937 busy = NULL; 4938 } while (!list_empty(list)); 4939 } 4940 4941 /* 4942 * make mem_cgroup's charge to be 0 if there is no task by moving 4943 * all the charges and pages to the parent. 4944 * This enables deleting this mem_cgroup. 4945 * 4946 * Caller is responsible for holding css reference on the memcg. 4947 */ 4948 static void mem_cgroup_reparent_charges(struct mem_cgroup *memcg) 4949 { 4950 int node, zid; 4951 u64 usage; 4952 4953 do { 4954 /* This is for making all *used* pages to be on LRU. */ 4955 lru_add_drain_all(); 4956 drain_all_stock_sync(memcg); 4957 mem_cgroup_start_move(memcg); 4958 for_each_node_state(node, N_MEMORY) { 4959 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 4960 enum lru_list lru; 4961 for_each_lru(lru) { 4962 mem_cgroup_force_empty_list(memcg, 4963 node, zid, lru); 4964 } 4965 } 4966 } 4967 mem_cgroup_end_move(memcg); 4968 memcg_oom_recover(memcg); 4969 cond_resched(); 4970 4971 /* 4972 * Kernel memory may not necessarily be trackable to a specific 4973 * process. So they are not migrated, and therefore we can't 4974 * expect their value to drop to 0 here. 4975 * Having res filled up with kmem only is enough. 4976 * 4977 * This is a safety check because mem_cgroup_force_empty_list 4978 * could have raced with mem_cgroup_replace_page_cache callers 4979 * so the lru seemed empty but the page could have been added 4980 * right after the check. RES_USAGE should be safe as we always 4981 * charge before adding to the LRU. 4982 */ 4983 usage = res_counter_read_u64(&memcg->res, RES_USAGE) - 4984 res_counter_read_u64(&memcg->kmem, RES_USAGE); 4985 } while (usage > 0); 4986 } 4987 4988 static inline bool memcg_has_children(struct mem_cgroup *memcg) 4989 { 4990 lockdep_assert_held(&memcg_create_mutex); 4991 /* 4992 * The lock does not prevent addition or deletion to the list 4993 * of children, but it prevents a new child from being 4994 * initialized based on this parent in css_online(), so it's 4995 * enough to decide whether hierarchically inherited 4996 * attributes can still be changed or not. 4997 */ 4998 return memcg->use_hierarchy && 4999 !list_empty(&memcg->css.cgroup->children); 5000 } 5001 5002 /* 5003 * Reclaims as many pages from the given memcg as possible and moves 5004 * the rest to the parent. 5005 * 5006 * Caller is responsible for holding css reference for memcg. 
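 *
 * This is the backend of the memcg v1 "memory.force_empty" knob, reached
 * through mem_cgroup_force_empty_write() below; illustratively (the path
 * assumes the usual v1 mount point):
 *
 *	echo 0 > /sys/fs/cgroup/memory/<group>/memory.force_empty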
5007 */ 5008 static int mem_cgroup_force_empty(struct mem_cgroup *memcg) 5009 { 5010 int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; 5011 struct cgroup *cgrp = memcg->css.cgroup; 5012 5013 /* returns EBUSY if there is a task or if we come here twice. */ 5014 if (cgroup_task_count(cgrp) || !list_empty(&cgrp->children)) 5015 return -EBUSY; 5016 5017 /* we call try-to-free pages for make this cgroup empty */ 5018 lru_add_drain_all(); 5019 /* try to free all pages in this cgroup */ 5020 while (nr_retries && res_counter_read_u64(&memcg->res, RES_USAGE) > 0) { 5021 int progress; 5022 5023 if (signal_pending(current)) 5024 return -EINTR; 5025 5026 progress = try_to_free_mem_cgroup_pages(memcg, GFP_KERNEL, 5027 false); 5028 if (!progress) { 5029 nr_retries--; 5030 /* maybe some writeback is necessary */ 5031 congestion_wait(BLK_RW_ASYNC, HZ/10); 5032 } 5033 5034 } 5035 lru_add_drain(); 5036 mem_cgroup_reparent_charges(memcg); 5037 5038 return 0; 5039 } 5040 5041 static int mem_cgroup_force_empty_write(struct cgroup_subsys_state *css, 5042 unsigned int event) 5043 { 5044 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5045 5046 if (mem_cgroup_is_root(memcg)) 5047 return -EINVAL; 5048 return mem_cgroup_force_empty(memcg); 5049 } 5050 5051 static u64 mem_cgroup_hierarchy_read(struct cgroup_subsys_state *css, 5052 struct cftype *cft) 5053 { 5054 return mem_cgroup_from_css(css)->use_hierarchy; 5055 } 5056 5057 static int mem_cgroup_hierarchy_write(struct cgroup_subsys_state *css, 5058 struct cftype *cft, u64 val) 5059 { 5060 int retval = 0; 5061 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5062 struct mem_cgroup *parent_memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 5063 5064 mutex_lock(&memcg_create_mutex); 5065 5066 if (memcg->use_hierarchy == val) 5067 goto out; 5068 5069 /* 5070 * If parent's use_hierarchy is set, we can't make any modifications 5071 * in the child subtrees. If it is unset, then the change can 5072 * occur, provided the current cgroup has no children. 5073 * 5074 * For the root cgroup, parent_mem is NULL, we allow value to be 5075 * set if there are no children. 5076 */ 5077 if ((!parent_memcg || !parent_memcg->use_hierarchy) && 5078 (val == 1 || val == 0)) { 5079 if (list_empty(&memcg->css.cgroup->children)) 5080 memcg->use_hierarchy = val; 5081 else 5082 retval = -EBUSY; 5083 } else 5084 retval = -EINVAL; 5085 5086 out: 5087 mutex_unlock(&memcg_create_mutex); 5088 5089 return retval; 5090 } 5091 5092 5093 static unsigned long mem_cgroup_recursive_stat(struct mem_cgroup *memcg, 5094 enum mem_cgroup_stat_index idx) 5095 { 5096 struct mem_cgroup *iter; 5097 long val = 0; 5098 5099 /* Per-cpu values can be negative, use a signed accumulator */ 5100 for_each_mem_cgroup_tree(iter, memcg) 5101 val += mem_cgroup_read_stat(iter, idx); 5102 5103 if (val < 0) /* race ? */ 5104 val = 0; 5105 return val; 5106 } 5107 5108 static inline u64 mem_cgroup_usage(struct mem_cgroup *memcg, bool swap) 5109 { 5110 u64 val; 5111 5112 if (!mem_cgroup_is_root(memcg)) { 5113 if (!swap) 5114 return res_counter_read_u64(&memcg->res, RES_USAGE); 5115 else 5116 return res_counter_read_u64(&memcg->memsw, RES_USAGE); 5117 } 5118 5119 /* 5120 * Transparent hugepages are still accounted for in MEM_CGROUP_STAT_RSS 5121 * as well as in MEM_CGROUP_STAT_RSS_HUGE. 
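 * Because of that overlap, summing CACHE and RSS below presumably already
 * covers huge pages, which is why RSS_HUGE is not added in again here.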
5122 */ 5123 val = mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_CACHE); 5124 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_RSS); 5125 5126 if (swap) 5127 val += mem_cgroup_recursive_stat(memcg, MEM_CGROUP_STAT_SWAP); 5128 5129 return val << PAGE_SHIFT; 5130 } 5131 5132 static ssize_t mem_cgroup_read(struct cgroup_subsys_state *css, 5133 struct cftype *cft, struct file *file, 5134 char __user *buf, size_t nbytes, loff_t *ppos) 5135 { 5136 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5137 char str[64]; 5138 u64 val; 5139 int name, len; 5140 enum res_type type; 5141 5142 type = MEMFILE_TYPE(cft->private); 5143 name = MEMFILE_ATTR(cft->private); 5144 5145 switch (type) { 5146 case _MEM: 5147 if (name == RES_USAGE) 5148 val = mem_cgroup_usage(memcg, false); 5149 else 5150 val = res_counter_read_u64(&memcg->res, name); 5151 break; 5152 case _MEMSWAP: 5153 if (name == RES_USAGE) 5154 val = mem_cgroup_usage(memcg, true); 5155 else 5156 val = res_counter_read_u64(&memcg->memsw, name); 5157 break; 5158 case _KMEM: 5159 val = res_counter_read_u64(&memcg->kmem, name); 5160 break; 5161 default: 5162 BUG(); 5163 } 5164 5165 len = scnprintf(str, sizeof(str), "%llu\n", (unsigned long long)val); 5166 return simple_read_from_buffer(buf, nbytes, ppos, str, len); 5167 } 5168 5169 static int memcg_update_kmem_limit(struct cgroup_subsys_state *css, u64 val) 5170 { 5171 int ret = -EINVAL; 5172 #ifdef CONFIG_MEMCG_KMEM 5173 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5174 /* 5175 * For simplicity, we won't allow this to be disabled. It also can't 5176 * be changed if the cgroup has children already, or if tasks had 5177 * already joined. 5178 * 5179 * If tasks join before we set the limit, a person looking at 5180 * kmem.usage_in_bytes will have no way to determine when it took 5181 * place, which makes the value quite meaningless. 5182 * 5183 * After it first became limited, changes in the value of the limit are 5184 * of course permitted. 5185 */ 5186 mutex_lock(&memcg_create_mutex); 5187 mutex_lock(&set_limit_mutex); 5188 if (!memcg->kmem_account_flags && val != RES_COUNTER_MAX) { 5189 if (cgroup_task_count(css->cgroup) || memcg_has_children(memcg)) { 5190 ret = -EBUSY; 5191 goto out; 5192 } 5193 ret = res_counter_set_limit(&memcg->kmem, val); 5194 VM_BUG_ON(ret); 5195 5196 ret = memcg_update_cache_sizes(memcg); 5197 if (ret) { 5198 res_counter_set_limit(&memcg->kmem, RES_COUNTER_MAX); 5199 goto out; 5200 } 5201 static_key_slow_inc(&memcg_kmem_enabled_key); 5202 /* 5203 * setting the active bit after the inc will guarantee no one 5204 * starts accounting before all call sites are patched 5205 */ 5206 memcg_kmem_set_active(memcg); 5207 } else 5208 ret = res_counter_set_limit(&memcg->kmem, val); 5209 out: 5210 mutex_unlock(&set_limit_mutex); 5211 mutex_unlock(&memcg_create_mutex); 5212 #endif 5213 return ret; 5214 } 5215 5216 #ifdef CONFIG_MEMCG_KMEM 5217 static int memcg_propagate_kmem(struct mem_cgroup *memcg) 5218 { 5219 int ret = 0; 5220 struct mem_cgroup *parent = parent_mem_cgroup(memcg); 5221 if (!parent) 5222 goto out; 5223 5224 memcg->kmem_account_flags = parent->kmem_account_flags; 5225 /* 5226 * When that happen, we need to disable the static branch only on those 5227 * memcgs that enabled it. To achieve this, we would be forced to 5228 * complicate the code by keeping track of which memcgs were the ones 5229 * that actually enabled limits, and which ones got it from its 5230 * parents. 
5231 * 5232 * It is a lot simpler just to do static_key_slow_inc() on every child 5233 * that is accounted. 5234 */ 5235 if (!memcg_kmem_is_active(memcg)) 5236 goto out; 5237 5238 /* 5239 * __mem_cgroup_free() will issue static_key_slow_dec() because this 5240 * memcg is active already. If the later initialization fails then the 5241 * cgroup core triggers the cleanup so we do not have to do it here. 5242 */ 5243 static_key_slow_inc(&memcg_kmem_enabled_key); 5244 5245 mutex_lock(&set_limit_mutex); 5246 memcg_stop_kmem_account(); 5247 ret = memcg_update_cache_sizes(memcg); 5248 memcg_resume_kmem_account(); 5249 mutex_unlock(&set_limit_mutex); 5250 out: 5251 return ret; 5252 } 5253 #endif /* CONFIG_MEMCG_KMEM */ 5254 5255 /* 5256 * The user of this function is... 5257 * RES_LIMIT. 5258 */ 5259 static int mem_cgroup_write(struct cgroup_subsys_state *css, struct cftype *cft, 5260 const char *buffer) 5261 { 5262 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5263 enum res_type type; 5264 int name; 5265 unsigned long long val; 5266 int ret; 5267 5268 type = MEMFILE_TYPE(cft->private); 5269 name = MEMFILE_ATTR(cft->private); 5270 5271 switch (name) { 5272 case RES_LIMIT: 5273 if (mem_cgroup_is_root(memcg)) { /* Can't set limit on root */ 5274 ret = -EINVAL; 5275 break; 5276 } 5277 /* This function does all necessary parse...reuse it */ 5278 ret = res_counter_memparse_write_strategy(buffer, &val); 5279 if (ret) 5280 break; 5281 if (type == _MEM) 5282 ret = mem_cgroup_resize_limit(memcg, val); 5283 else if (type == _MEMSWAP) 5284 ret = mem_cgroup_resize_memsw_limit(memcg, val); 5285 else if (type == _KMEM) 5286 ret = memcg_update_kmem_limit(css, val); 5287 else 5288 return -EINVAL; 5289 break; 5290 case RES_SOFT_LIMIT: 5291 ret = res_counter_memparse_write_strategy(buffer, &val); 5292 if (ret) 5293 break; 5294 /* 5295 * For memsw, soft limits are hard to implement in terms 5296 * of semantics, for now, we support soft limits for 5297 * control without swap 5298 */ 5299 if (type == _MEM) 5300 ret = res_counter_set_soft_limit(&memcg->res, val); 5301 else 5302 ret = -EINVAL; 5303 break; 5304 default: 5305 ret = -EINVAL; /* should be BUG() ? 
*/ 5306 break; 5307 } 5308 return ret; 5309 } 5310 5311 static void memcg_get_hierarchical_limit(struct mem_cgroup *memcg, 5312 unsigned long long *mem_limit, unsigned long long *memsw_limit) 5313 { 5314 unsigned long long min_limit, min_memsw_limit, tmp; 5315 5316 min_limit = res_counter_read_u64(&memcg->res, RES_LIMIT); 5317 min_memsw_limit = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5318 if (!memcg->use_hierarchy) 5319 goto out; 5320 5321 while (css_parent(&memcg->css)) { 5322 memcg = mem_cgroup_from_css(css_parent(&memcg->css)); 5323 if (!memcg->use_hierarchy) 5324 break; 5325 tmp = res_counter_read_u64(&memcg->res, RES_LIMIT); 5326 min_limit = min(min_limit, tmp); 5327 tmp = res_counter_read_u64(&memcg->memsw, RES_LIMIT); 5328 min_memsw_limit = min(min_memsw_limit, tmp); 5329 } 5330 out: 5331 *mem_limit = min_limit; 5332 *memsw_limit = min_memsw_limit; 5333 } 5334 5335 static int mem_cgroup_reset(struct cgroup_subsys_state *css, unsigned int event) 5336 { 5337 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5338 int name; 5339 enum res_type type; 5340 5341 type = MEMFILE_TYPE(event); 5342 name = MEMFILE_ATTR(event); 5343 5344 switch (name) { 5345 case RES_MAX_USAGE: 5346 if (type == _MEM) 5347 res_counter_reset_max(&memcg->res); 5348 else if (type == _MEMSWAP) 5349 res_counter_reset_max(&memcg->memsw); 5350 else if (type == _KMEM) 5351 res_counter_reset_max(&memcg->kmem); 5352 else 5353 return -EINVAL; 5354 break; 5355 case RES_FAILCNT: 5356 if (type == _MEM) 5357 res_counter_reset_failcnt(&memcg->res); 5358 else if (type == _MEMSWAP) 5359 res_counter_reset_failcnt(&memcg->memsw); 5360 else if (type == _KMEM) 5361 res_counter_reset_failcnt(&memcg->kmem); 5362 else 5363 return -EINVAL; 5364 break; 5365 } 5366 5367 return 0; 5368 } 5369 5370 static u64 mem_cgroup_move_charge_read(struct cgroup_subsys_state *css, 5371 struct cftype *cft) 5372 { 5373 return mem_cgroup_from_css(css)->move_charge_at_immigrate; 5374 } 5375 5376 #ifdef CONFIG_MMU 5377 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5378 struct cftype *cft, u64 val) 5379 { 5380 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5381 5382 if (val >= (1 << NR_MOVE_TYPE)) 5383 return -EINVAL; 5384 5385 /* 5386 * No kind of locking is needed in here, because ->can_attach() will 5387 * check this value once in the beginning of the process, and then carry 5388 * on with stale data. This means that changes to this value will only 5389 * affect task migrations starting after the change. 
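 *
 * The value itself is a bitmask written through the memcg v1
 * "memory.move_charge_at_immigrate" file (illustrative; commonly bit 0
 * selects anonymous pages and bit 1 file pages - see the memcg
 * documentation for the authoritative bit semantics):
 *
 *	echo 3 > /sys/fs/cgroup/memory/<group>/memory.move_charge_at_immigrate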
5390 */ 5391 memcg->move_charge_at_immigrate = val; 5392 return 0; 5393 } 5394 #else 5395 static int mem_cgroup_move_charge_write(struct cgroup_subsys_state *css, 5396 struct cftype *cft, u64 val) 5397 { 5398 return -ENOSYS; 5399 } 5400 #endif 5401 5402 #ifdef CONFIG_NUMA 5403 static int memcg_numa_stat_show(struct cgroup_subsys_state *css, 5404 struct cftype *cft, struct seq_file *m) 5405 { 5406 struct numa_stat { 5407 const char *name; 5408 unsigned int lru_mask; 5409 }; 5410 5411 static const struct numa_stat stats[] = { 5412 { "total", LRU_ALL }, 5413 { "file", LRU_ALL_FILE }, 5414 { "anon", LRU_ALL_ANON }, 5415 { "unevictable", BIT(LRU_UNEVICTABLE) }, 5416 }; 5417 const struct numa_stat *stat; 5418 int nid; 5419 unsigned long nr; 5420 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5421 5422 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5423 nr = mem_cgroup_nr_lru_pages(memcg, stat->lru_mask); 5424 seq_printf(m, "%s=%lu", stat->name, nr); 5425 for_each_node_state(nid, N_MEMORY) { 5426 nr = mem_cgroup_node_nr_lru_pages(memcg, nid, 5427 stat->lru_mask); 5428 seq_printf(m, " N%d=%lu", nid, nr); 5429 } 5430 seq_putc(m, '\n'); 5431 } 5432 5433 for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) { 5434 struct mem_cgroup *iter; 5435 5436 nr = 0; 5437 for_each_mem_cgroup_tree(iter, memcg) 5438 nr += mem_cgroup_nr_lru_pages(iter, stat->lru_mask); 5439 seq_printf(m, "hierarchical_%s=%lu", stat->name, nr); 5440 for_each_node_state(nid, N_MEMORY) { 5441 nr = 0; 5442 for_each_mem_cgroup_tree(iter, memcg) 5443 nr += mem_cgroup_node_nr_lru_pages( 5444 iter, nid, stat->lru_mask); 5445 seq_printf(m, " N%d=%lu", nid, nr); 5446 } 5447 seq_putc(m, '\n'); 5448 } 5449 5450 return 0; 5451 } 5452 #endif /* CONFIG_NUMA */ 5453 5454 static inline void mem_cgroup_lru_names_not_uptodate(void) 5455 { 5456 BUILD_BUG_ON(ARRAY_SIZE(mem_cgroup_lru_names) != NR_LRU_LISTS); 5457 } 5458 5459 static int memcg_stat_show(struct cgroup_subsys_state *css, struct cftype *cft, 5460 struct seq_file *m) 5461 { 5462 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5463 struct mem_cgroup *mi; 5464 unsigned int i; 5465 5466 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5467 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5468 continue; 5469 seq_printf(m, "%s %ld\n", mem_cgroup_stat_names[i], 5470 mem_cgroup_read_stat(memcg, i) * PAGE_SIZE); 5471 } 5472 5473 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) 5474 seq_printf(m, "%s %lu\n", mem_cgroup_events_names[i], 5475 mem_cgroup_read_events(memcg, i)); 5476 5477 for (i = 0; i < NR_LRU_LISTS; i++) 5478 seq_printf(m, "%s %lu\n", mem_cgroup_lru_names[i], 5479 mem_cgroup_nr_lru_pages(memcg, BIT(i)) * PAGE_SIZE); 5480 5481 /* Hierarchical information */ 5482 { 5483 unsigned long long limit, memsw_limit; 5484 memcg_get_hierarchical_limit(memcg, &limit, &memsw_limit); 5485 seq_printf(m, "hierarchical_memory_limit %llu\n", limit); 5486 if (do_swap_account) 5487 seq_printf(m, "hierarchical_memsw_limit %llu\n", 5488 memsw_limit); 5489 } 5490 5491 for (i = 0; i < MEM_CGROUP_STAT_NSTATS; i++) { 5492 long long val = 0; 5493 5494 if (i == MEM_CGROUP_STAT_SWAP && !do_swap_account) 5495 continue; 5496 for_each_mem_cgroup_tree(mi, memcg) 5497 val += mem_cgroup_read_stat(mi, i) * PAGE_SIZE; 5498 seq_printf(m, "total_%s %lld\n", mem_cgroup_stat_names[i], val); 5499 } 5500 5501 for (i = 0; i < MEM_CGROUP_EVENTS_NSTATS; i++) { 5502 unsigned long long val = 0; 5503 5504 for_each_mem_cgroup_tree(mi, memcg) 5505 val += mem_cgroup_read_events(mi, i); 5506 seq_printf(m, 
"total_%s %llu\n", 5507 mem_cgroup_events_names[i], val); 5508 } 5509 5510 for (i = 0; i < NR_LRU_LISTS; i++) { 5511 unsigned long long val = 0; 5512 5513 for_each_mem_cgroup_tree(mi, memcg) 5514 val += mem_cgroup_nr_lru_pages(mi, BIT(i)) * PAGE_SIZE; 5515 seq_printf(m, "total_%s %llu\n", mem_cgroup_lru_names[i], val); 5516 } 5517 5518 #ifdef CONFIG_DEBUG_VM 5519 { 5520 int nid, zid; 5521 struct mem_cgroup_per_zone *mz; 5522 struct zone_reclaim_stat *rstat; 5523 unsigned long recent_rotated[2] = {0, 0}; 5524 unsigned long recent_scanned[2] = {0, 0}; 5525 5526 for_each_online_node(nid) 5527 for (zid = 0; zid < MAX_NR_ZONES; zid++) { 5528 mz = mem_cgroup_zoneinfo(memcg, nid, zid); 5529 rstat = &mz->lruvec.reclaim_stat; 5530 5531 recent_rotated[0] += rstat->recent_rotated[0]; 5532 recent_rotated[1] += rstat->recent_rotated[1]; 5533 recent_scanned[0] += rstat->recent_scanned[0]; 5534 recent_scanned[1] += rstat->recent_scanned[1]; 5535 } 5536 seq_printf(m, "recent_rotated_anon %lu\n", recent_rotated[0]); 5537 seq_printf(m, "recent_rotated_file %lu\n", recent_rotated[1]); 5538 seq_printf(m, "recent_scanned_anon %lu\n", recent_scanned[0]); 5539 seq_printf(m, "recent_scanned_file %lu\n", recent_scanned[1]); 5540 } 5541 #endif 5542 5543 return 0; 5544 } 5545 5546 static u64 mem_cgroup_swappiness_read(struct cgroup_subsys_state *css, 5547 struct cftype *cft) 5548 { 5549 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5550 5551 return mem_cgroup_swappiness(memcg); 5552 } 5553 5554 static int mem_cgroup_swappiness_write(struct cgroup_subsys_state *css, 5555 struct cftype *cft, u64 val) 5556 { 5557 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5558 struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css)); 5559 5560 if (val > 100 || !parent) 5561 return -EINVAL; 5562 5563 mutex_lock(&memcg_create_mutex); 5564 5565 /* If under hierarchy, only empty-root can set this value */ 5566 if ((parent->use_hierarchy) || memcg_has_children(memcg)) { 5567 mutex_unlock(&memcg_create_mutex); 5568 return -EINVAL; 5569 } 5570 5571 memcg->swappiness = val; 5572 5573 mutex_unlock(&memcg_create_mutex); 5574 5575 return 0; 5576 } 5577 5578 static void __mem_cgroup_threshold(struct mem_cgroup *memcg, bool swap) 5579 { 5580 struct mem_cgroup_threshold_ary *t; 5581 u64 usage; 5582 int i; 5583 5584 rcu_read_lock(); 5585 if (!swap) 5586 t = rcu_dereference(memcg->thresholds.primary); 5587 else 5588 t = rcu_dereference(memcg->memsw_thresholds.primary); 5589 5590 if (!t) 5591 goto unlock; 5592 5593 usage = mem_cgroup_usage(memcg, swap); 5594 5595 /* 5596 * current_threshold points to threshold just below or equal to usage. 5597 * If it's not true, a threshold was crossed after last 5598 * call of __mem_cgroup_threshold(). 5599 */ 5600 i = t->current_threshold; 5601 5602 /* 5603 * Iterate backward over array of thresholds starting from 5604 * current_threshold and check if a threshold is crossed. 5605 * If none of thresholds below usage is crossed, we read 5606 * only one element of the array here. 5607 */ 5608 for (; i >= 0 && unlikely(t->entries[i].threshold > usage); i--) 5609 eventfd_signal(t->entries[i].eventfd, 1); 5610 5611 /* i = current_threshold + 1 */ 5612 i++; 5613 5614 /* 5615 * Iterate forward over array of thresholds starting from 5616 * current_threshold+1 and check if a threshold is crossed. 5617 * If none of thresholds above usage is crossed, we read 5618 * only one element of the array here. 
5619 */ 5620 for (; i < t->size && unlikely(t->entries[i].threshold <= usage); i++) 5621 eventfd_signal(t->entries[i].eventfd, 1); 5622 5623 /* Update current_threshold */ 5624 t->current_threshold = i - 1; 5625 unlock: 5626 rcu_read_unlock(); 5627 } 5628 5629 static void mem_cgroup_threshold(struct mem_cgroup *memcg) 5630 { 5631 while (memcg) { 5632 __mem_cgroup_threshold(memcg, false); 5633 if (do_swap_account) 5634 __mem_cgroup_threshold(memcg, true); 5635 5636 memcg = parent_mem_cgroup(memcg); 5637 } 5638 } 5639 5640 static int compare_thresholds(const void *a, const void *b) 5641 { 5642 const struct mem_cgroup_threshold *_a = a; 5643 const struct mem_cgroup_threshold *_b = b; 5644 5645 if (_a->threshold > _b->threshold) 5646 return 1; 5647 5648 if (_a->threshold < _b->threshold) 5649 return -1; 5650 5651 return 0; 5652 } 5653 5654 static int mem_cgroup_oom_notify_cb(struct mem_cgroup *memcg) 5655 { 5656 struct mem_cgroup_eventfd_list *ev; 5657 5658 list_for_each_entry(ev, &memcg->oom_notify, list) 5659 eventfd_signal(ev->eventfd, 1); 5660 return 0; 5661 } 5662 5663 static void mem_cgroup_oom_notify(struct mem_cgroup *memcg) 5664 { 5665 struct mem_cgroup *iter; 5666 5667 for_each_mem_cgroup_tree(iter, memcg) 5668 mem_cgroup_oom_notify_cb(iter); 5669 } 5670 5671 static int mem_cgroup_usage_register_event(struct cgroup_subsys_state *css, 5672 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5673 { 5674 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5675 struct mem_cgroup_thresholds *thresholds; 5676 struct mem_cgroup_threshold_ary *new; 5677 enum res_type type = MEMFILE_TYPE(cft->private); 5678 u64 threshold, usage; 5679 int i, size, ret; 5680 5681 ret = res_counter_memparse_write_strategy(args, &threshold); 5682 if (ret) 5683 return ret; 5684 5685 mutex_lock(&memcg->thresholds_lock); 5686 5687 if (type == _MEM) 5688 thresholds = &memcg->thresholds; 5689 else if (type == _MEMSWAP) 5690 thresholds = &memcg->memsw_thresholds; 5691 else 5692 BUG(); 5693 5694 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5695 5696 /* Check if a threshold crossed before adding a new one */ 5697 if (thresholds->primary) 5698 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5699 5700 size = thresholds->primary ? thresholds->primary->size + 1 : 1; 5701 5702 /* Allocate memory for new array of thresholds */ 5703 new = kmalloc(sizeof(*new) + size * sizeof(struct mem_cgroup_threshold), 5704 GFP_KERNEL); 5705 if (!new) { 5706 ret = -ENOMEM; 5707 goto unlock; 5708 } 5709 new->size = size; 5710 5711 /* Copy thresholds (if any) to new array */ 5712 if (thresholds->primary) { 5713 memcpy(new->entries, thresholds->primary->entries, (size - 1) * 5714 sizeof(struct mem_cgroup_threshold)); 5715 } 5716 5717 /* Add new threshold */ 5718 new->entries[size - 1].eventfd = eventfd; 5719 new->entries[size - 1].threshold = threshold; 5720 5721 /* Sort thresholds. Registering of new threshold isn't time-critical */ 5722 sort(new->entries, size, sizeof(struct mem_cgroup_threshold), 5723 compare_thresholds, NULL); 5724 5725 /* Find current threshold */ 5726 new->current_threshold = -1; 5727 for (i = 0; i < size; i++) { 5728 if (new->entries[i].threshold <= usage) { 5729 /* 5730 * new->current_threshold will not be used until 5731 * rcu_assign_pointer(), so it's safe to increment 5732 * it here. 
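 * For example (illustrative): with sorted thresholds {4M, 8M, 16M} and a
 * current usage of 10M, this scan ends with current_threshold pointing at
 * the 8M entry, the largest threshold not above usage.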
5733 */ 5734 ++new->current_threshold; 5735 } else 5736 break; 5737 } 5738 5739 /* Free old spare buffer and save old primary buffer as spare */ 5740 kfree(thresholds->spare); 5741 thresholds->spare = thresholds->primary; 5742 5743 rcu_assign_pointer(thresholds->primary, new); 5744 5745 /* To be sure that nobody uses thresholds */ 5746 synchronize_rcu(); 5747 5748 unlock: 5749 mutex_unlock(&memcg->thresholds_lock); 5750 5751 return ret; 5752 } 5753 5754 static void mem_cgroup_usage_unregister_event(struct cgroup_subsys_state *css, 5755 struct cftype *cft, struct eventfd_ctx *eventfd) 5756 { 5757 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5758 struct mem_cgroup_thresholds *thresholds; 5759 struct mem_cgroup_threshold_ary *new; 5760 enum res_type type = MEMFILE_TYPE(cft->private); 5761 u64 usage; 5762 int i, j, size; 5763 5764 mutex_lock(&memcg->thresholds_lock); 5765 if (type == _MEM) 5766 thresholds = &memcg->thresholds; 5767 else if (type == _MEMSWAP) 5768 thresholds = &memcg->memsw_thresholds; 5769 else 5770 BUG(); 5771 5772 if (!thresholds->primary) 5773 goto unlock; 5774 5775 usage = mem_cgroup_usage(memcg, type == _MEMSWAP); 5776 5777 /* Check if a threshold crossed before removing */ 5778 __mem_cgroup_threshold(memcg, type == _MEMSWAP); 5779 5780 /* Calculate new number of threshold */ 5781 size = 0; 5782 for (i = 0; i < thresholds->primary->size; i++) { 5783 if (thresholds->primary->entries[i].eventfd != eventfd) 5784 size++; 5785 } 5786 5787 new = thresholds->spare; 5788 5789 /* Set thresholds array to NULL if we don't have thresholds */ 5790 if (!size) { 5791 kfree(new); 5792 new = NULL; 5793 goto swap_buffers; 5794 } 5795 5796 new->size = size; 5797 5798 /* Copy thresholds and find current threshold */ 5799 new->current_threshold = -1; 5800 for (i = 0, j = 0; i < thresholds->primary->size; i++) { 5801 if (thresholds->primary->entries[i].eventfd == eventfd) 5802 continue; 5803 5804 new->entries[j] = thresholds->primary->entries[i]; 5805 if (new->entries[j].threshold <= usage) { 5806 /* 5807 * new->current_threshold will not be used 5808 * until rcu_assign_pointer(), so it's safe to increment 5809 * it here. 5810 */ 5811 ++new->current_threshold; 5812 } 5813 j++; 5814 } 5815 5816 swap_buffers: 5817 /* Swap primary and spare array */ 5818 thresholds->spare = thresholds->primary; 5819 /* If all events are unregistered, free the spare array */ 5820 if (!new) { 5821 kfree(thresholds->spare); 5822 thresholds->spare = NULL; 5823 } 5824 5825 rcu_assign_pointer(thresholds->primary, new); 5826 5827 /* To be sure that nobody uses thresholds */ 5828 synchronize_rcu(); 5829 unlock: 5830 mutex_unlock(&memcg->thresholds_lock); 5831 } 5832 5833 static int mem_cgroup_oom_register_event(struct cgroup_subsys_state *css, 5834 struct cftype *cft, struct eventfd_ctx *eventfd, const char *args) 5835 { 5836 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 5837 struct mem_cgroup_eventfd_list *event; 5838 enum res_type type = MEMFILE_TYPE(cft->private); 5839 5840 BUG_ON(type != _OOM_TYPE); 5841 event = kmalloc(sizeof(*event), GFP_KERNEL); 5842 if (!event) 5843 return -ENOMEM; 5844 5845 spin_lock(&memcg_oom_lock); 5846 5847 event->eventfd = eventfd; 5848 list_add(&event->list, &memcg->oom_notify); 5849 5850 /* already in OOM ? 
 */
5851	if (atomic_read(&memcg->under_oom))
5852		eventfd_signal(eventfd, 1);
5853	spin_unlock(&memcg_oom_lock);
5854
5855	return 0;
5856 }
5857
5858 static void mem_cgroup_oom_unregister_event(struct cgroup_subsys_state *css,
5859	struct cftype *cft, struct eventfd_ctx *eventfd)
5860 {
5861	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5862	struct mem_cgroup_eventfd_list *ev, *tmp;
5863	enum res_type type = MEMFILE_TYPE(cft->private);
5864
5865	BUG_ON(type != _OOM_TYPE);
5866
5867	spin_lock(&memcg_oom_lock);
5868
5869	list_for_each_entry_safe(ev, tmp, &memcg->oom_notify, list) {
5870		if (ev->eventfd == eventfd) {
5871			list_del(&ev->list);
5872			kfree(ev);
5873		}
5874	}
5875
5876	spin_unlock(&memcg_oom_lock);
5877 }
5878
5879 static int mem_cgroup_oom_control_read(struct cgroup_subsys_state *css,
5880	struct cftype *cft, struct cgroup_map_cb *cb)
5881 {
5882	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5883
5884	cb->fill(cb, "oom_kill_disable", memcg->oom_kill_disable);
5885
5886	if (atomic_read(&memcg->under_oom))
5887		cb->fill(cb, "under_oom", 1);
5888	else
5889		cb->fill(cb, "under_oom", 0);
5890	return 0;
5891 }
5892
5893 static int mem_cgroup_oom_control_write(struct cgroup_subsys_state *css,
5894	struct cftype *cft, u64 val)
5895 {
5896	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
5897	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(&memcg->css));
5898
5899	/* cannot set to root cgroup and only 0 and 1 are allowed */
5900	if (!parent || !((val == 0) || (val == 1)))
5901		return -EINVAL;
5902
5903	mutex_lock(&memcg_create_mutex);
5904	/* oom-kill-disable is inherited; only an empty hierarchy root may change it */
5905	if ((parent->use_hierarchy) || memcg_has_children(memcg)) {
5906		mutex_unlock(&memcg_create_mutex);
5907		return -EINVAL;
5908	}
5909	memcg->oom_kill_disable = val;
5910	if (!val)
5911		memcg_oom_recover(memcg);
5912	mutex_unlock(&memcg_create_mutex);
5913	return 0;
5914 }
5915
5916 #ifdef CONFIG_MEMCG_KMEM
5917 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
5918 {
5919	int ret;
5920
5921	memcg->kmemcg_id = -1;
5922	ret = memcg_propagate_kmem(memcg);
5923	if (ret)
5924		return ret;
5925
5926	return mem_cgroup_sockets_init(memcg, ss);
5927 }
5928
5929 static void memcg_destroy_kmem(struct mem_cgroup *memcg)
5930 {
5931	mem_cgroup_sockets_destroy(memcg);
5932 }
5933
5934 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg)
5935 {
5936	if (!memcg_kmem_is_active(memcg))
5937		return;
5938
5939	/*
5940	 * kmem charges can outlive the cgroup. In the case of slab
5941	 * pages, for instance, a page can contain objects from various
5942	 * processes. As we do not take a reference for every
5943	 * such allocation we have to be careful when doing uncharge
5944	 * (see memcg_uncharge_kmem) and here during offlining.
5945	 *
5946	 * The idea is that only the _last_ uncharge which sees
5947	 * the dead memcg will drop the last reference. An additional
5948	 * reference is taken here before the group is marked dead,
5949	 * which is then paired with a css_put either during uncharge or here.
5950	 *
5951	 * Although this might sound strange, as this path is called from
5952	 * css_offline() when the reference might have dropped down to 0
5953	 * and shouldn't be incremented anymore (css_tryget would fail),
5954	 * we have no other option because of the kmem allocations'
5955	 * lifetime.
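	 *
	 * In short: the css_get() below is balanced either by the css_put()
	 * at the end of this function, when kmem usage is already zero, or
	 * by the last memcg_uncharge_kmem() that sees the group marked dead.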
5956 */ 5957 css_get(&memcg->css); 5958 5959 memcg_kmem_mark_dead(memcg); 5960 5961 if (res_counter_read_u64(&memcg->kmem, RES_USAGE) != 0) 5962 return; 5963 5964 if (memcg_kmem_test_and_clear_dead(memcg)) 5965 css_put(&memcg->css); 5966 } 5967 #else 5968 static int memcg_init_kmem(struct mem_cgroup *memcg, struct cgroup_subsys *ss) 5969 { 5970 return 0; 5971 } 5972 5973 static void memcg_destroy_kmem(struct mem_cgroup *memcg) 5974 { 5975 } 5976 5977 static void kmem_cgroup_css_offline(struct mem_cgroup *memcg) 5978 { 5979 } 5980 #endif 5981 5982 static struct cftype mem_cgroup_files[] = { 5983 { 5984 .name = "usage_in_bytes", 5985 .private = MEMFILE_PRIVATE(_MEM, RES_USAGE), 5986 .read = mem_cgroup_read, 5987 .register_event = mem_cgroup_usage_register_event, 5988 .unregister_event = mem_cgroup_usage_unregister_event, 5989 }, 5990 { 5991 .name = "max_usage_in_bytes", 5992 .private = MEMFILE_PRIVATE(_MEM, RES_MAX_USAGE), 5993 .trigger = mem_cgroup_reset, 5994 .read = mem_cgroup_read, 5995 }, 5996 { 5997 .name = "limit_in_bytes", 5998 .private = MEMFILE_PRIVATE(_MEM, RES_LIMIT), 5999 .write_string = mem_cgroup_write, 6000 .read = mem_cgroup_read, 6001 }, 6002 { 6003 .name = "soft_limit_in_bytes", 6004 .private = MEMFILE_PRIVATE(_MEM, RES_SOFT_LIMIT), 6005 .write_string = mem_cgroup_write, 6006 .read = mem_cgroup_read, 6007 }, 6008 { 6009 .name = "failcnt", 6010 .private = MEMFILE_PRIVATE(_MEM, RES_FAILCNT), 6011 .trigger = mem_cgroup_reset, 6012 .read = mem_cgroup_read, 6013 }, 6014 { 6015 .name = "stat", 6016 .read_seq_string = memcg_stat_show, 6017 }, 6018 { 6019 .name = "force_empty", 6020 .trigger = mem_cgroup_force_empty_write, 6021 }, 6022 { 6023 .name = "use_hierarchy", 6024 .flags = CFTYPE_INSANE, 6025 .write_u64 = mem_cgroup_hierarchy_write, 6026 .read_u64 = mem_cgroup_hierarchy_read, 6027 }, 6028 { 6029 .name = "swappiness", 6030 .read_u64 = mem_cgroup_swappiness_read, 6031 .write_u64 = mem_cgroup_swappiness_write, 6032 }, 6033 { 6034 .name = "move_charge_at_immigrate", 6035 .read_u64 = mem_cgroup_move_charge_read, 6036 .write_u64 = mem_cgroup_move_charge_write, 6037 }, 6038 { 6039 .name = "oom_control", 6040 .read_map = mem_cgroup_oom_control_read, 6041 .write_u64 = mem_cgroup_oom_control_write, 6042 .register_event = mem_cgroup_oom_register_event, 6043 .unregister_event = mem_cgroup_oom_unregister_event, 6044 .private = MEMFILE_PRIVATE(_OOM_TYPE, OOM_CONTROL), 6045 }, 6046 { 6047 .name = "pressure_level", 6048 .register_event = vmpressure_register_event, 6049 .unregister_event = vmpressure_unregister_event, 6050 }, 6051 #ifdef CONFIG_NUMA 6052 { 6053 .name = "numa_stat", 6054 .read_seq_string = memcg_numa_stat_show, 6055 }, 6056 #endif 6057 #ifdef CONFIG_MEMCG_KMEM 6058 { 6059 .name = "kmem.limit_in_bytes", 6060 .private = MEMFILE_PRIVATE(_KMEM, RES_LIMIT), 6061 .write_string = mem_cgroup_write, 6062 .read = mem_cgroup_read, 6063 }, 6064 { 6065 .name = "kmem.usage_in_bytes", 6066 .private = MEMFILE_PRIVATE(_KMEM, RES_USAGE), 6067 .read = mem_cgroup_read, 6068 }, 6069 { 6070 .name = "kmem.failcnt", 6071 .private = MEMFILE_PRIVATE(_KMEM, RES_FAILCNT), 6072 .trigger = mem_cgroup_reset, 6073 .read = mem_cgroup_read, 6074 }, 6075 { 6076 .name = "kmem.max_usage_in_bytes", 6077 .private = MEMFILE_PRIVATE(_KMEM, RES_MAX_USAGE), 6078 .trigger = mem_cgroup_reset, 6079 .read = mem_cgroup_read, 6080 }, 6081 #ifdef CONFIG_SLABINFO 6082 { 6083 .name = "kmem.slabinfo", 6084 .read_seq_string = mem_cgroup_slabinfo_read, 6085 }, 6086 #endif 6087 #endif 6088 { }, /* terminate */ 6089 }; 6090 6091 
#ifdef CONFIG_MEMCG_SWAP 6092 static struct cftype memsw_cgroup_files[] = { 6093 { 6094 .name = "memsw.usage_in_bytes", 6095 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_USAGE), 6096 .read = mem_cgroup_read, 6097 .register_event = mem_cgroup_usage_register_event, 6098 .unregister_event = mem_cgroup_usage_unregister_event, 6099 }, 6100 { 6101 .name = "memsw.max_usage_in_bytes", 6102 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_MAX_USAGE), 6103 .trigger = mem_cgroup_reset, 6104 .read = mem_cgroup_read, 6105 }, 6106 { 6107 .name = "memsw.limit_in_bytes", 6108 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_LIMIT), 6109 .write_string = mem_cgroup_write, 6110 .read = mem_cgroup_read, 6111 }, 6112 { 6113 .name = "memsw.failcnt", 6114 .private = MEMFILE_PRIVATE(_MEMSWAP, RES_FAILCNT), 6115 .trigger = mem_cgroup_reset, 6116 .read = mem_cgroup_read, 6117 }, 6118 { }, /* terminate */ 6119 }; 6120 #endif 6121 static int alloc_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6122 { 6123 struct mem_cgroup_per_node *pn; 6124 struct mem_cgroup_per_zone *mz; 6125 int zone, tmp = node; 6126 /* 6127 * This routine is called against possible nodes. 6128 * But it's BUG to call kmalloc() against offline node. 6129 * 6130 * TODO: this routine can waste much memory for nodes which will 6131 * never be onlined. It's better to use memory hotplug callback 6132 * function. 6133 */ 6134 if (!node_state(node, N_NORMAL_MEMORY)) 6135 tmp = -1; 6136 pn = kzalloc_node(sizeof(*pn), GFP_KERNEL, tmp); 6137 if (!pn) 6138 return 1; 6139 6140 for (zone = 0; zone < MAX_NR_ZONES; zone++) { 6141 mz = &pn->zoneinfo[zone]; 6142 lruvec_init(&mz->lruvec); 6143 mz->usage_in_excess = 0; 6144 mz->on_tree = false; 6145 mz->memcg = memcg; 6146 } 6147 memcg->nodeinfo[node] = pn; 6148 return 0; 6149 } 6150 6151 static void free_mem_cgroup_per_zone_info(struct mem_cgroup *memcg, int node) 6152 { 6153 kfree(memcg->nodeinfo[node]); 6154 } 6155 6156 static struct mem_cgroup *mem_cgroup_alloc(void) 6157 { 6158 struct mem_cgroup *memcg; 6159 size_t size = memcg_size(); 6160 6161 /* Can be very big if nr_node_ids is very big */ 6162 if (size < PAGE_SIZE) 6163 memcg = kzalloc(size, GFP_KERNEL); 6164 else 6165 memcg = vzalloc(size); 6166 6167 if (!memcg) 6168 return NULL; 6169 6170 memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); 6171 if (!memcg->stat) 6172 goto out_free; 6173 spin_lock_init(&memcg->pcp_counter_lock); 6174 return memcg; 6175 6176 out_free: 6177 if (size < PAGE_SIZE) 6178 kfree(memcg); 6179 else 6180 vfree(memcg); 6181 return NULL; 6182 } 6183 6184 /* 6185 * At destroying mem_cgroup, references from swap_cgroup can remain. 6186 * (scanning all at force_empty is too costly...) 6187 * 6188 * Instead of clearing all references at force_empty, we remember 6189 * the number of reference from swap_cgroup and free mem_cgroup when 6190 * it goes down to 0. 6191 * 6192 * Removal of cgroup itself succeeds regardless of refs from swap. 6193 */ 6194 6195 static void __mem_cgroup_free(struct mem_cgroup *memcg) 6196 { 6197 int node; 6198 size_t size = memcg_size(); 6199 6200 mem_cgroup_remove_from_trees(memcg); 6201 6202 for_each_node(node) 6203 free_mem_cgroup_per_zone_info(memcg, node); 6204 6205 free_percpu(memcg->stat); 6206 6207 /* 6208 * We need to make sure that (at least for now), the jump label 6209 * destruction code runs outside of the cgroup lock. This is because 6210 * get_online_cpus(), which is called from the static_branch update, 6211 * can't be called inside the cgroup_lock. 
cpusets are the ones
6212	 * enforcing this dependency, so if they ever change, we might as well.
6213	 *
6214	 * schedule_work() will guarantee this happens. Be careful if you need
6215	 * to move this code around, and make sure it is outside
6216	 * the cgroup_lock.
6217	 */
6218	disarm_static_keys(memcg);
6219	if (size < PAGE_SIZE)
6220		kfree(memcg);
6221	else
6222		vfree(memcg);
6223 }
6224
6225 /*
6226  * Returns the parent mem_cgroup in the memcg hierarchy with hierarchy enabled.
6227  */
6228 struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg)
6229 {
6230	if (!memcg->res.parent)
6231		return NULL;
6232	return mem_cgroup_from_res_counter(memcg->res.parent, res);
6233 }
6234 EXPORT_SYMBOL(parent_mem_cgroup);
6235
6236 static void __init mem_cgroup_soft_limit_tree_init(void)
6237 {
6238	struct mem_cgroup_tree_per_node *rtpn;
6239	struct mem_cgroup_tree_per_zone *rtpz;
6240	int tmp, node, zone;
6241
6242	for_each_node(node) {
6243		tmp = node;
6244		if (!node_state(node, N_NORMAL_MEMORY))
6245			tmp = -1;
6246		rtpn = kzalloc_node(sizeof(*rtpn), GFP_KERNEL, tmp);
6247		BUG_ON(!rtpn);
6248
6249		soft_limit_tree.rb_tree_per_node[node] = rtpn;
6250
6251		for (zone = 0; zone < MAX_NR_ZONES; zone++) {
6252			rtpz = &rtpn->rb_tree_per_zone[zone];
6253			rtpz->rb_root = RB_ROOT;
6254			spin_lock_init(&rtpz->lock);
6255		}
6256	}
6257 }
6258
6259 static struct cgroup_subsys_state * __ref
6260 mem_cgroup_css_alloc(struct cgroup_subsys_state *parent_css)
6261 {
6262	struct mem_cgroup *memcg;
6263	long error = -ENOMEM;
6264	int node;
6265
6266	memcg = mem_cgroup_alloc();
6267	if (!memcg)
6268		return ERR_PTR(error);
6269
6270	for_each_node(node)
6271		if (alloc_mem_cgroup_per_zone_info(memcg, node))
6272			goto free_out;
6273
6274	/* root ? */
6275	if (parent_css == NULL) {
6276		root_mem_cgroup = memcg;
6277		res_counter_init(&memcg->res, NULL);
6278		res_counter_init(&memcg->memsw, NULL);
6279		res_counter_init(&memcg->kmem, NULL);
6280	}
6281
6282	memcg->last_scanned_node = MAX_NUMNODES;
6283	INIT_LIST_HEAD(&memcg->oom_notify);
6284	memcg->move_charge_at_immigrate = 0;
6285	mutex_init(&memcg->thresholds_lock);
6286	spin_lock_init(&memcg->move_lock);
6287	vmpressure_init(&memcg->vmpressure);
6288
6289	return &memcg->css;
6290
6291 free_out:
6292	__mem_cgroup_free(memcg);
6293	return ERR_PTR(error);
6294 }
6295
6296 static int
6297 mem_cgroup_css_online(struct cgroup_subsys_state *css)
6298 {
6299	struct mem_cgroup *memcg = mem_cgroup_from_css(css);
6300	struct mem_cgroup *parent = mem_cgroup_from_css(css_parent(css));
6301	int error = 0;
6302
6303	if (css->cgroup->id > MEM_CGROUP_ID_MAX)
6304		return -ENOSPC;
6305
6306	if (!parent)
6307		return 0;
6308
6309	mutex_lock(&memcg_create_mutex);
6310
6311	memcg->use_hierarchy = parent->use_hierarchy;
6312	memcg->oom_kill_disable = parent->oom_kill_disable;
6313	memcg->swappiness = mem_cgroup_swappiness(parent);
6314
6315	if (parent->use_hierarchy) {
6316		res_counter_init(&memcg->res, &parent->res);
6317		res_counter_init(&memcg->memsw, &parent->memsw);
6318		res_counter_init(&memcg->kmem, &parent->kmem);
6319
6320		/*
6321		 * No need to take a reference to the parent because cgroup
6322		 * core guarantees its existence.
6323		 */
6324	} else {
6325		res_counter_init(&memcg->res, NULL);
6326		res_counter_init(&memcg->memsw, NULL);
6327		res_counter_init(&memcg->kmem, NULL);
6328		/*
6329		 * A deeper hierarchy with use_hierarchy == false doesn't make
6330		 * much sense, so let the cgroup subsystem know about this
6331		 * unfortunate state in our controller.
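		 * Setting broken_hierarchy below makes the cgroup core print
		 * a one-time warning when further nested groups are created
		 * with this controller attached.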
6332 */ 6333 if (parent != root_mem_cgroup) 6334 mem_cgroup_subsys.broken_hierarchy = true; 6335 } 6336 6337 error = memcg_init_kmem(memcg, &mem_cgroup_subsys); 6338 mutex_unlock(&memcg_create_mutex); 6339 return error; 6340 } 6341 6342 /* 6343 * Announce all parents that a group from their hierarchy is gone. 6344 */ 6345 static void mem_cgroup_invalidate_reclaim_iterators(struct mem_cgroup *memcg) 6346 { 6347 struct mem_cgroup *parent = memcg; 6348 6349 while ((parent = parent_mem_cgroup(parent))) 6350 mem_cgroup_iter_invalidate(parent); 6351 6352 /* 6353 * if the root memcg is not hierarchical we have to check it 6354 * explicitely. 6355 */ 6356 if (!root_mem_cgroup->use_hierarchy) 6357 mem_cgroup_iter_invalidate(root_mem_cgroup); 6358 } 6359 6360 static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) 6361 { 6362 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6363 struct cgroup_subsys_state *iter; 6364 6365 kmem_cgroup_css_offline(memcg); 6366 6367 mem_cgroup_invalidate_reclaim_iterators(memcg); 6368 6369 /* 6370 * This requires that offlining is serialized. Right now that is 6371 * guaranteed because css_killed_work_fn() holds the cgroup_mutex. 6372 */ 6373 rcu_read_lock(); 6374 css_for_each_descendant_post(iter, css) { 6375 rcu_read_unlock(); 6376 mem_cgroup_reparent_charges(mem_cgroup_from_css(iter)); 6377 rcu_read_lock(); 6378 } 6379 rcu_read_unlock(); 6380 6381 mem_cgroup_destroy_all_caches(memcg); 6382 vmpressure_cleanup(&memcg->vmpressure); 6383 } 6384 6385 static void mem_cgroup_css_free(struct cgroup_subsys_state *css) 6386 { 6387 struct mem_cgroup *memcg = mem_cgroup_from_css(css); 6388 /* 6389 * XXX: css_offline() would be where we should reparent all 6390 * memory to prepare the cgroup for destruction. However, 6391 * memcg does not do css_tryget() and res_counter charging 6392 * under the same RCU lock region, which means that charging 6393 * could race with offlining. Offlining only happens to 6394 * cgroups with no tasks in them but charges can show up 6395 * without any tasks from the swapin path when the target 6396 * memcg is looked up from the swapout record and not from the 6397 * current task as it usually is. A race like this can leak 6398 * charges and put pages with stale cgroup pointers into 6399 * circulation: 6400 * 6401 * #0 #1 6402 * lookup_swap_cgroup_id() 6403 * rcu_read_lock() 6404 * mem_cgroup_lookup() 6405 * css_tryget() 6406 * rcu_read_unlock() 6407 * disable css_tryget() 6408 * call_rcu() 6409 * offline_css() 6410 * reparent_charges() 6411 * res_counter_charge() 6412 * css_put() 6413 * css_free() 6414 * pc->mem_cgroup = dead memcg 6415 * add page to lru 6416 * 6417 * The bulk of the charges are still moved in offline_css() to 6418 * avoid pinning a lot of pages in case a long-term reference 6419 * like a swapout record is deferring the css_free() to long 6420 * after offlining. But this makes sure we catch any charges 6421 * made after offlining: 6422 */ 6423 mem_cgroup_reparent_charges(memcg); 6424 6425 memcg_destroy_kmem(memcg); 6426 __mem_cgroup_free(memcg); 6427 } 6428 6429 #ifdef CONFIG_MMU 6430 /* Handlers for move charge at task migration. 
 */
6431 #define PRECHARGE_COUNT_AT_ONCE 256
6432 static int mem_cgroup_do_precharge(unsigned long count)
6433 {
6434	int ret = 0;
6435	int batch_count = PRECHARGE_COUNT_AT_ONCE;
6436	struct mem_cgroup *memcg = mc.to;
6437
6438	if (mem_cgroup_is_root(memcg)) {
6439		mc.precharge += count;
6440		/* we don't need css_get for root */
6441		return ret;
6442	}
6443	/* try to charge at once */
6444	if (count > 1) {
6445		struct res_counter *dummy;
6446		/*
6447		 * "memcg" cannot be under rmdir() because we've already checked
6448		 * by cgroup_lock_live_cgroup() that it is not removed and we
6449		 * are still under the same cgroup_mutex. So we can postpone
6450		 * css_get().
6451		 */
6452		if (res_counter_charge(&memcg->res, PAGE_SIZE * count, &dummy))
6453			goto one_by_one;
6454		if (do_swap_account && res_counter_charge(&memcg->memsw,
6455						PAGE_SIZE * count, &dummy)) {
6456			res_counter_uncharge(&memcg->res, PAGE_SIZE * count);
6457			goto one_by_one;
6458		}
6459		mc.precharge += count;
6460		return ret;
6461	}
6462 one_by_one:
6463	/* fall back to one by one charge */
6464	while (count--) {
6465		if (signal_pending(current)) {
6466			ret = -EINTR;
6467			break;
6468		}
6469		if (!batch_count--) {
6470			batch_count = PRECHARGE_COUNT_AT_ONCE;
6471			cond_resched();
6472		}
6473		ret = __mem_cgroup_try_charge(NULL,
6474					GFP_KERNEL, 1, &memcg, false);
6475		if (ret)
6476			/* mem_cgroup_clear_mc() will do uncharge later */
6477			return ret;
6478		mc.precharge++;
6479	}
6480	return ret;
6481 }
6482
6483 /**
6484  * get_mctgt_type - get target type of moving charge
6485  * @vma: the vma the pte to be checked belongs to
6486  * @addr: the address corresponding to the pte to be checked
6487  * @ptent: the pte to be checked
6488  * @target: the pointer where the target page or swap entry will be stored (can be NULL)
6489  *
6490  * Returns
6491  * 0 (MC_TARGET_NONE): if the pte is not a target for move charge.
6492  * 1 (MC_TARGET_PAGE): if the page corresponding to this pte is a target for
6493  *   move charge. If @target is not NULL, the page is stored in target->page
6494  *   with an extra refcount taken (callers should handle it).
6495  * 2 (MC_TARGET_SWAP): if the swap entry corresponding to this pte is a
6496  *   target for charge migration. If @target is not NULL, the entry is stored
6497  *   in target->ent.
6498  *
6499  * Called with pte lock held.
6500  */
6501 union mc_target {
6502	struct page *page;
6503