Et bien prenons un téléphone. Vous démontez un téléphone des années 90 (20 ans ce n'est pas non plus "si" vieux"). Vous pourriez obtenir a peu près ça (en l’occurrence là c'est faux c'est un téléphone de 1960) :
Aujourd'hui vous obtenez ça :
Vous allez me dire ... "Et donc ?" il y a moins de composants a première vue. Je parle de "à première vue" parce que dans le seconde vue, il faut ajourd'hui ajouter quelque chose qui ressemble à ca, et ce plusieurs milliers de fois :
1/* 2 * Generic process-grouping system. 3 * 4 * Based originally on the cpuset system, extracted by Paul Menage 5 * Copyright (C) 2006 Google, Inc 6 * 7 * Notifications support 8 * Copyright (C) 2009 Nokia Corporation 9 * Author: Kirill A. Shutemov 10 * 11 * Copyright notices from the original cpuset code: 12 * -------------------------------------------------- 13 * Copyright (C) 2003 BULL SA. 14 * Copyright (C) 2004-2006 Silicon Graphics, Inc. 15 * 16 * Portions derived from Patrick Mochel's sysfs code. 17 * sysfs is Copyright (c) 2001-3 Patrick Mochel 18 * 19 * 2003-10-10 Written by Simon Derr. 20 * 2003-10-22 Updates by Stephen Hemminger. 21 * 2004 May-July Rework by Paul Jackson. 22 * --------------------------------------------------- 23 * 24 * This file is subject to the terms and conditions of the GNU General Public 25 * License. See the file COPYING in the main directory of the Linux 26 * distribution for more details. 27 */ 28 29#include30#include 31#include 32#include 33#include 34#include 35#include 36#include 37#include 38#include 39#include 40#include 41#include 42#include 43#include 44#include 45#include 46#include 47#include 48#include 49#include 50#include 51#include 52#include 53#include 54#include 55#include 56#include 57#include 58#include 59#include /* TODO: replace with more sophisticated array */ 60#include 61#include 62#include /* used in cgroup_attach_proc */ 63#include 64 65#include 66 67/* css deactivation bias, makes css->refcnt negative to deny new trygets */ 68#define CSS_DEACT_BIAS INT_MIN 69 70/* 71 * cgroup_mutex is the master lock. Any modification to cgroup or its 72 * hierarchy must be performed while holding it. 73 * 74 * cgroup_root_mutex nests inside cgroup_mutex and should be held to modify 75 * cgroupfs_root of any cgroup hierarchy - subsys list, flags, 76 * release_agent_path and so on. Modifying requires both cgroup_mutex and 77 * cgroup_root_mutex. Readers can acquire either of the two. This is to 78 * break the following locking order cycle. 79 * 80 * A. cgroup_mutex -> cred_guard_mutex -> s_type->i_mutex_key -> namespace_sem 81 * B. namespace_sem -> cgroup_mutex 82 * 83 * B happens only through cgroup_show_options() and using cgroup_root_mutex 84 * breaks it. 85 */ 86static DEFINE_MUTEX(cgroup_mutex); 87static DEFINE_MUTEX(cgroup_root_mutex); 88 89/* 90 * Generate an array of cgroup subsystem pointers. At boot time, this is 91 * populated with the built in subsystems, and modular subsystems are 92 * registered after that. The mutable section of this array is protected by 93 * cgroup_mutex. 94 */ 95#define SUBSYS(_x) [_x ## _subsys_id] = &_x ## _subsys, 96#define IS_SUBSYS_ENABLED(option) IS_BUILTIN(option) 97static struct cgroup_subsys *subsys[CGROUP_SUBSYS_COUNT] = { 98#include 99}; 100 101#define MAX_CGROUP_ROOT_NAMELEN 64 102 103/* 104 * A cgroupfs_root represents the root of a cgroup hierarchy, 105 * and may be associated with a superblock to form an active 106 * hierarchy 107 */ 108struct cgroupfs_root { 109 struct super_block *sb; 110 111 /* 112 * The bitmask of subsystems intended to be attached to this 113 * hierarchy 114 */ 115 unsigned long subsys_mask; 116 117 /* Unique id for this hierarchy. */ 118 int hierarchy_id; 119 120 /* The bitmask of subsystems currently attached to this hierarchy */ 121 unsigned long actual_subsys_mask; 122 123 /* A list running through the attached subsystems */ 124 struct list_head subsys_list; 125 126 /* The root cgroup for this hierarchy */ 127 struct cgroup top_cgroup; 128 129 /* Tracks how many cgroups are currently defined in hierarchy.*/ 130 int number_of_cgroups; 131 132 /* A list running through the active hierarchies */ 133 struct list_head root_list; 134 135 /* All cgroups on this root, cgroup_mutex protected */ 136 struct list_head allcg_list; 137 138 /* Hierarchy-specific flags */ 139 unsigned long flags; 140 141 /* IDs for cgroups in this hierarchy */ 142 struct ida cgroup_ida; 143 144 /* The path to use for release notifications. */ 145 char release_agent_path[PATH_MAX]; 146 147 /* The name for this hierarchy - may be empty */ 148 char name[MAX_CGROUP_ROOT_NAMELEN]; 149}; 150 151/* 152 * The "rootnode" hierarchy is the "dummy hierarchy", reserved for the 153 * subsystems that are otherwise unattached - it never has more than a 154 * single cgroup, and all tasks are part of that cgroup. 155 */ 156static struct cgroupfs_root rootnode; 157 158/* 159 * cgroupfs file entry, pointed to from leaf dentry->d_fsdata. 160 */ 161struct cfent { 162 struct list_head node; 163 struct dentry *dentry; 164 struct cftype *type; 165 166 /* file xattrs */ 167 struct simple_xattrs xattrs; 168}; 169 170/* 171 * CSS ID -- ID per subsys's Cgroup Subsys State(CSS). used only when 172 * cgroup_subsys->use_id != 0. 173 */ 174#define CSS_ID_MAX (65535) 175struct css_id { 176 /* 177 * The css to which this ID points. This pointer is set to valid value 178 * after cgroup is populated. If cgroup is removed, this will be NULL. 179 * This pointer is expected to be RCU-safe because destroy() 180 * is called after synchronize_rcu(). But for safe use, css_tryget() 181 * should be used for avoiding race. 182 */ 183 struct cgroup_subsys_state __rcu *css; 184 /* 185 * ID of this css. 186 */ 187 unsigned short id; 188 /* 189 * Depth in hierarchy which this ID belongs to. 190 */ 191 unsigned short depth; 192 /* 193 * ID is freed by RCU. (and lookup routine is RCU safe.) 194 */ 195 struct rcu_head rcu_head; 196 /* 197 * Hierarchy of CSS ID belongs to. 198 */ 199 unsigned short stack[0]; /* Array of Length (depth+1) */ 200}; 201 202/* 203 * cgroup_event represents events which userspace want to receive. 204 */ 205struct cgroup_event { 206 /* 207 * Cgroup which the event belongs to. 208 */ 209 struct cgroup *cgrp; 210 /* 211 * Control file which the event associated. 212 */ 213 struct cftype *cft; 214 /* 215 * eventfd to signal userspace about the event. 216 */ 217 struct eventfd_ctx *eventfd; 218 /* 219 * Each of these stored in a list by the cgroup. 220 */ 221 struct list_head list; 222 /* 223 * All fields below needed to unregister event when 224 * userspace closes eventfd. 225 */ 226 poll_table pt; 227 wait_queue_head_t *wqh; 228 wait_queue_t wait; 229 struct work_struct remove; 230}; 231 232/* The list of hierarchy roots */ 233 234static LIST_HEAD(roots); 235static int root_count; 236 237static DEFINE_IDA(hierarchy_ida); 238static int next_hierarchy_id; 239static DEFINE_SPINLOCK(hierarchy_id_lock); 240 241/* dummytop is a shorthand for the dummy hierarchy's top cgroup */ 242#define dummytop (&rootnode.top_cgroup) 243 244/* This flag indicates whether tasks in the fork and exit paths should 245 * check for fork/exit handlers to call. This avoids us having to do 246 * extra work in the fork/exit path if none of the subsystems need to 247 * be called. 248 */ 249static int need_forkexit_callback __read_mostly; 250 251static int cgroup_destroy_locked(struct cgroup *cgrp); 252static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 253 struct cftype cfts[], bool is_add); 254 255#ifdef CONFIG_PROVE_LOCKING 256int cgroup_lock_is_held(void) 257{ 258 return lockdep_is_held(&cgroup_mutex); 259} 260#else /* #ifdef CONFIG_PROVE_LOCKING */ 261int cgroup_lock_is_held(void) 262{ 263 return mutex_is_locked(&cgroup_mutex); 264} 265#endif /* #else #ifdef CONFIG_PROVE_LOCKING */ 266 267EXPORT_SYMBOL_GPL(cgroup_lock_is_held); 268 269static int css_unbias_refcnt(int refcnt) 270{ 271 return refcnt >= 0 ? refcnt : refcnt - CSS_DEACT_BIAS; 272} 273 274/* the current nr of refs, always >= 0 whether @css is deactivated or not */ 275static int css_refcnt(struct cgroup_subsys_state *css) 276{ 277 int v = atomic_read(&css->refcnt); 278 279 return css_unbias_refcnt(v); 280} 281 282/* convenient tests for these bits */ 283inline int cgroup_is_removed(const struct cgroup *cgrp) 284{ 285 return test_bit(CGRP_REMOVED, &cgrp->flags); 286} 287 288/* bits in struct cgroupfs_root flags field */ 289enum { 290 ROOT_NOPREFIX, /* mounted subsystems have no named prefix */ 291 ROOT_XATTR, /* supports extended attributes */ 292}; 293 294static int cgroup_is_releasable(const struct cgroup *cgrp) 295{ 296 const int bits = 297 (1 << CGRP_RELEASABLE) | 298 (1 << CGRP_NOTIFY_ON_RELEASE); 299 return (cgrp->flags & bits) == bits; 300} 301 302static int notify_on_release(const struct cgroup *cgrp) 303{ 304 return test_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 305} 306 307/* 308 * for_each_subsys() allows you to iterate on each subsystem attached to 309 * an active hierarchy 310 */ 311#define for_each_subsys(_root, _ss) \ 312list_for_each_entry(_ss, &_root->subsys_list, sibling) 313 314/* for_each_active_root() allows you to iterate across the active hierarchies */ 315#define for_each_active_root(_root) \ 316list_for_each_entry(_root, &roots, root_list) 317 318static inline struct cgroup *__d_cgrp(struct dentry *dentry) 319{ 320 return dentry->d_fsdata; 321} 322 323static inline struct cfent *__d_cfe(struct dentry *dentry) 324{ 325 return dentry->d_fsdata; 326} 327 328static inline struct cftype *__d_cft(struct dentry *dentry) 329{ 330 return __d_cfe(dentry)->type; 331} 332 333/* the list of cgroups eligible for automatic release. Protected by 334 * release_list_lock */ 335static LIST_HEAD(release_list); 336static DEFINE_RAW_SPINLOCK(release_list_lock); 337static void cgroup_release_agent(struct work_struct *work); 338static DECLARE_WORK(release_agent_work, cgroup_release_agent); 339static void check_for_release(struct cgroup *cgrp); 340 341/* Link structure for associating css_set objects with cgroups */ 342struct cg_cgroup_link { 343 /* 344 * List running through cg_cgroup_links associated with a 345 * cgroup, anchored on cgroup->css_sets 346 */ 347 struct list_head cgrp_link_list; 348 struct cgroup *cgrp; 349 /* 350 * List running through cg_cgroup_links pointing at a 351 * single css_set object, anchored on css_set->cg_links 352 */ 353 struct list_head cg_link_list; 354 struct css_set *cg; 355}; 356 357/* The default css_set - used by init and its children prior to any 358 * hierarchies being mounted. It contains a pointer to the root state 359 * for each subsystem. Also used to anchor the list of css_sets. Not 360 * reference-counted, to improve performance when child cgroups 361 * haven't been created. 362 */ 363 364static struct css_set init_css_set; 365static struct cg_cgroup_link init_css_set_link; 366 367static int cgroup_init_idr(struct cgroup_subsys *ss, 368 struct cgroup_subsys_state *css); 369 370/* css_set_lock protects the list of css_set objects, and the 371 * chain of tasks off each css_set. Nests outside task->alloc_lock 372 * due to cgroup_iter_start() */ 373static DEFINE_RWLOCK(css_set_lock); 374static int css_set_count; 375 376/* 377 * hash table for cgroup groups. This improves the performance to find 378 * an existing css_set. This hash doesn't (currently) take into 379 * account cgroups in empty hierarchies. 380 */ 381#define CSS_SET_HASH_BITS 7 382static DEFINE_HASHTABLE(css_set_table, CSS_SET_HASH_BITS); 383 384static unsigned long css_set_hash(struct cgroup_subsys_state *css[]) 385{ 386 int i; 387 unsigned long key = 0UL; 388 389 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) 390 key += (unsigned long)css[i]; 391 key = (key >> 16) ^ key; 392 393 return key; 394} 395 396/* We don't maintain the lists running through each css_set to its 397 * task until after the first call to cgroup_iter_start(). This 398 * reduces the fork()/exit() overhead for people who have cgroups 399 * compiled into their kernel but not actually in use */ 400static int use_task_css_set_links __read_mostly; 401 402static void __put_css_set(struct css_set *cg, int taskexit) 403{ 404 struct cg_cgroup_link *link; 405 struct cg_cgroup_link *saved_link; 406 /* 407 * Ensure that the refcount doesn't hit zero while any readers 408 * can see it. Similar to atomic_dec_and_lock(), but for an 409 * rwlock 410 */ 411 if (atomic_add_unless(&cg->refcount, -1, 1)) 412 return; 413 write_lock(&css_set_lock); 414 if (!atomic_dec_and_test(&cg->refcount)) { 415 write_unlock(&css_set_lock); 416 return; 417 } 418 419 /* This css_set is dead. unlink it and release cgroup refcounts */ 420 hash_del(&cg->hlist); 421 css_set_count--; 422 423 list_for_each_entry_safe(link, saved_link, &cg->cg_links, 424 cg_link_list) { 425 struct cgroup *cgrp = link->cgrp; 426 list_del(&link->cg_link_list); 427 list_del(&link->cgrp_link_list); 428 429 /* 430 * We may not be holding cgroup_mutex, and if cgrp->count is 431 * dropped to 0 the cgroup can be destroyed at any time, hence 432 * rcu_read_lock is used to keep it alive. 433 */ 434 rcu_read_lock(); 435 if (atomic_dec_and_test(&cgrp->count) && 436 notify_on_release(cgrp)) { 437 if (taskexit) 438 set_bit(CGRP_RELEASABLE, &cgrp->flags); 439 check_for_release(cgrp); 440 } 441 rcu_read_unlock(); 442 443 kfree(link); 444 } 445 446 write_unlock(&css_set_lock); 447 kfree_rcu(cg, rcu_head); 448} 449 450/* 451 * refcounted get/put for css_set objects 452 */ 453static inline void get_css_set(struct css_set *cg) 454{ 455 atomic_inc(&cg->refcount); 456} 457 458static inline void put_css_set(struct css_set *cg) 459{ 460 __put_css_set(cg, 0); 461} 462 463static inline void put_css_set_taskexit(struct css_set *cg) 464{ 465 __put_css_set(cg, 1); 466} 467 468/* 469 * compare_css_sets - helper function for find_existing_css_set(). 470 * @cg: candidate css_set being tested 471 * @old_cg: existing css_set for a task 472 * @new_cgrp: cgroup that's being entered by the task 473 * @template: desired set of css pointers in css_set (pre-calculated) 474 * 475 * Returns true if "cg" matches "old_cg" except for the hierarchy 476 * which "new_cgrp" belongs to, for which it should match "new_cgrp". 477 */ 478static bool compare_css_sets(struct css_set *cg, 479 struct css_set *old_cg, 480 struct cgroup *new_cgrp, 481 struct cgroup_subsys_state *template[]) 482{ 483 struct list_head *l1, *l2; 484 485 if (memcmp(template, cg->subsys, sizeof(cg->subsys))) { 486 /* Not all subsystems matched */ 487 return false; 488 } 489 490 /* 491 * Compare cgroup pointers in order to distinguish between 492 * different cgroups in heirarchies with no subsystems. We 493 * could get by with just this check alone (and skip the 494 * memcmp above) but on most setups the memcmp check will 495 * avoid the need for this more expensive check on almost all 496 * candidates. 497 */ 498 499 l1 = &cg->cg_links; 500 l2 = &old_cg->cg_links; 501 while (1) { 502 struct cg_cgroup_link *cgl1, *cgl2; 503 struct cgroup *cg1, *cg2; 504 505 l1 = l1->next; 506 l2 = l2->next; 507 /* See if we reached the end - both lists are equal length. */ 508 if (l1 == &cg->cg_links) { 509 BUG_ON(l2 != &old_cg->cg_links); 510 break; 511 } else { 512 BUG_ON(l2 == &old_cg->cg_links); 513 } 514 /* Locate the cgroups associated with these links. */ 515 cgl1 = list_entry(l1, struct cg_cgroup_link, cg_link_list); 516 cgl2 = list_entry(l2, struct cg_cgroup_link, cg_link_list); 517 cg1 = cgl1->cgrp; 518 cg2 = cgl2->cgrp; 519 /* Hierarchies should be linked in the same order. */ 520 BUG_ON(cg1->root != cg2->root); 521 522 /* 523 * If this hierarchy is the hierarchy of the cgroup 524 * that's changing, then we need to check that this 525 * css_set points to the new cgroup; if it's any other 526 * hierarchy, then this css_set should point to the 527 * same cgroup as the old css_set. 528 */ 529 if (cg1->root == new_cgrp->root) { 530 if (cg1 != new_cgrp) 531 return false; 532 } else { 533 if (cg1 != cg2) 534 return false; 535 } 536 } 537 return true; 538} 539 540/* 541 * find_existing_css_set() is a helper for 542 * find_css_set(), and checks to see whether an existing 543 * css_set is suitable. 544 * 545 * oldcg: the cgroup group that we're using before the cgroup 546 * transition 547 * 548 * cgrp: the cgroup that we're moving into 549 * 550 * template: location in which to build the desired set of subsystem 551 * state objects for the new cgroup group 552 */ 553static struct css_set *find_existing_css_set( 554 struct css_set *oldcg, 555 struct cgroup *cgrp, 556 struct cgroup_subsys_state *template[]) 557{ 558 int i; 559 struct cgroupfs_root *root = cgrp->root; 560 struct css_set *cg; 561 unsigned long key; 562 563 /* 564 * Build the set of subsystem state objects that we want to see in the 565 * new css_set. while subsystems can change globally, the entries here 566 * won't change, so no need for locking. 567 */ 568 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 569 if (root->subsys_mask & (1UL << i)) { 570 /* Subsystem is in this hierarchy. So we want 571 * the subsystem state from the new 572 * cgroup */ 573 template[i] = cgrp->subsys[i]; 574 } else { 575 /* Subsystem is not in this hierarchy, so we 576 * don't want to change the subsystem state */ 577 template[i] = oldcg->subsys[i]; 578 } 579 } 580 581 key = css_set_hash(template); 582 hash_for_each_possible(css_set_table, cg, hlist, key) { 583 if (!compare_css_sets(cg, oldcg, cgrp, template)) 584 continue; 585 586 /* This css_set matches what we need */ 587 return cg; 588 } 589 590 /* No existing cgroup group matched */ 591 return NULL; 592} 593 594static void free_cg_links(struct list_head *tmp) 595{ 596 struct cg_cgroup_link *link; 597 struct cg_cgroup_link *saved_link; 598 599 list_for_each_entry_safe(link, saved_link, tmp, cgrp_link_list) { 600 list_del(&link->cgrp_link_list); 601 kfree(link); 602 } 603} 604 605/* 606 * allocate_cg_links() allocates "count" cg_cgroup_link structures 607 * and chains them on tmp through their cgrp_link_list fields. Returns 0 on 608 * success or a negative error 609 */ 610static int allocate_cg_links(int count, struct list_head *tmp) 611{ 612 struct cg_cgroup_link *link; 613 int i; 614 INIT_LIST_HEAD(tmp); 615 for (i = 0; i < count; i++) { 616 link = kmalloc(sizeof(*link), GFP_KERNEL); 617 if (!link) { 618 free_cg_links(tmp); 619 return -ENOMEM; 620 } 621 list_add(&link->cgrp_link_list, tmp); 622 } 623 return 0; 624} 625 626/** 627 * link_css_set - a helper function to link a css_set to a cgroup 628 * @tmp_cg_links: cg_cgroup_link objects allocated by allocate_cg_links() 629 * @cg: the css_set to be linked 630 * @cgrp: the destination cgroup 631 */ 632static void link_css_set(struct list_head *tmp_cg_links, 633 struct css_set *cg, struct cgroup *cgrp) 634{ 635 struct cg_cgroup_link *link; 636 637 BUG_ON(list_empty(tmp_cg_links)); 638 link = list_first_entry(tmp_cg_links, struct cg_cgroup_link, 639 cgrp_link_list); 640 link->cg = cg; 641 link->cgrp = cgrp; 642 atomic_inc(&cgrp->count); 643 list_move(&link->cgrp_link_list, &cgrp->css_sets); 644 /* 645 * Always add links to the tail of the list so that the list 646 * is sorted by order of hierarchy creation 647 */ 648 list_add_tail(&link->cg_link_list, &cg->cg_links); 649} 650 651/* 652 * find_css_set() takes an existing cgroup group and a 653 * cgroup object, and returns a css_set object that's 654 * equivalent to the old group, but with the given cgroup 655 * substituted into the appropriate hierarchy. Must be called with 656 * cgroup_mutex held 657 */ 658static struct css_set *find_css_set( 659 struct css_set *oldcg, struct cgroup *cgrp) 660{ 661 struct css_set *res; 662 struct cgroup_subsys_state *template[CGROUP_SUBSYS_COUNT]; 663 664 struct list_head tmp_cg_links; 665 666 struct cg_cgroup_link *link; 667 unsigned long key; 668 669 /* First see if we already have a cgroup group that matches 670 * the desired set */ 671 read_lock(&css_set_lock); 672 res = find_existing_css_set(oldcg, cgrp, template); 673 if (res) 674 get_css_set(res); 675 read_unlock(&css_set_lock); 676 677 if (res) 678 return res; 679 680 res = kmalloc(sizeof(*res), GFP_KERNEL); 681 if (!res) 682 return NULL; 683 684 /* Allocate all the cg_cgroup_link objects that we'll need */ 685 if (allocate_cg_links(root_count, &tmp_cg_links) < 0) { 686 kfree(res); 687 return NULL; 688 } 689 690 atomic_set(&res->refcount, 1); 691 INIT_LIST_HEAD(&res->cg_links); 692 INIT_LIST_HEAD(&res->tasks); 693 INIT_HLIST_NODE(&res->hlist); 694 695 /* Copy the set of subsystem state objects generated in 696 * find_existing_css_set() */ 697 memcpy(res->subsys, template, sizeof(res->subsys)); 698 699 write_lock(&css_set_lock); 700 /* Add reference counts and links from the new css_set. */ 701 list_for_each_entry(link, &oldcg->cg_links, cg_link_list) { 702 struct cgroup *c = link->cgrp; 703 if (c->root == cgrp->root) 704 c = cgrp; 705 link_css_set(&tmp_cg_links, res, c); 706 } 707 708 BUG_ON(!list_empty(&tmp_cg_links)); 709 710 css_set_count++; 711 712 /* Add this cgroup group to the hash table */ 713 key = css_set_hash(res->subsys); 714 hash_add(css_set_table, &res->hlist, key); 715 716 write_unlock(&css_set_lock); 717 718 return res; 719} 720 721/* 722 * Return the cgroup for "task" from the given hierarchy. Must be 723 * called with cgroup_mutex held. 724 */ 725static struct cgroup *task_cgroup_from_root(struct task_struct *task, 726 struct cgroupfs_root *root) 727{ 728 struct css_set *css; 729 struct cgroup *res = NULL; 730 731 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 732 read_lock(&css_set_lock); 733 /* 734 * No need to lock the task - since we hold cgroup_mutex the 735 * task can't change groups, so the only thing that can happen 736 * is that it exits and its css is set back to init_css_set. 737 */ 738 css = task->cgroups; 739 if (css == &init_css_set) { 740 res = &root->top_cgroup; 741 } else { 742 struct cg_cgroup_link *link; 743 list_for_each_entry(link, &css->cg_links, cg_link_list) { 744 struct cgroup *c = link->cgrp; 745 if (c->root == root) { 746 res = c; 747 break; 748 } 749 } 750 } 751 read_unlock(&css_set_lock); 752 BUG_ON(!res); 753 return res; 754} 755 756/* 757 * There is one global cgroup mutex. We also require taking 758 * task_lock() when dereferencing a task's cgroup subsys pointers. 759 * See "The task_lock() exception", at the end of this comment. 760 * 761 * A task must hold cgroup_mutex to modify cgroups. 762 * 763 * Any task can increment and decrement the count field without lock. 764 * So in general, code holding cgroup_mutex can't rely on the count 765 * field not changing. However, if the count goes to zero, then only 766 * cgroup_attach_task() can increment it again. Because a count of zero 767 * means that no tasks are currently attached, therefore there is no 768 * way a task attached to that cgroup can fork (the other way to 769 * increment the count). So code holding cgroup_mutex can safely 770 * assume that if the count is zero, it will stay zero. Similarly, if 771 * a task holds cgroup_mutex on a cgroup with zero count, it 772 * knows that the cgroup won't be removed, as cgroup_rmdir() 773 * needs that mutex. 774 * 775 * The fork and exit callbacks cgroup_fork() and cgroup_exit(), don't 776 * (usually) take cgroup_mutex. These are the two most performance 777 * critical pieces of code here. The exception occurs on cgroup_exit(), 778 * when a task in a notify_on_release cgroup exits. Then cgroup_mutex 779 * is taken, and if the cgroup count is zero, a usermode call made 780 * to the release agent with the name of the cgroup (path relative to 781 * the root of cgroup file system) as the argument. 782 * 783 * A cgroup can only be deleted if both its 'count' of using tasks 784 * is zero, and its list of 'children' cgroups is empty. Since all 785 * tasks in the system use _some_ cgroup, and since there is always at 786 * least one task in the system (init, pid == 1), therefore, top_cgroup 787 * always has either children cgroups and/or using tasks. So we don't 788 * need a special hack to ensure that top_cgroup cannot be deleted. 789 * 790 * The task_lock() exception 791 * 792 * The need for this exception arises from the action of 793 * cgroup_attach_task(), which overwrites one task's cgroup pointer with 794 * another. It does so using cgroup_mutex, however there are 795 * several performance critical places that need to reference 796 * task->cgroup without the expense of grabbing a system global 797 * mutex. Therefore except as noted below, when dereferencing or, as 798 * in cgroup_attach_task(), modifying a task's cgroup pointer we use 799 * task_lock(), which acts on a spinlock (task->alloc_lock) already in 800 * the task_struct routinely used for such matters. 801 * 802 * P.S. One more locking exception. RCU is used to guard the 803 * update of a tasks cgroup pointer by cgroup_attach_task() 804 */ 805 806/** 807 * cgroup_lock - lock out any changes to cgroup structures 808 * 809 */ 810void cgroup_lock(void) 811{ 812 mutex_lock(&cgroup_mutex); 813} 814EXPORT_SYMBOL_GPL(cgroup_lock); 815 816/** 817 * cgroup_unlock - release lock on cgroup changes 818 * 819 * Undo the lock taken in a previous cgroup_lock() call. 820 */ 821void cgroup_unlock(void) 822{ 823 mutex_unlock(&cgroup_mutex); 824} 825EXPORT_SYMBOL_GPL(cgroup_unlock); 826 827/* 828 * A couple of forward declarations required, due to cyclic reference loop: 829 * cgroup_mkdir -> cgroup_create -> cgroup_populate_dir -> 830 * cgroup_add_file -> cgroup_create_file -> cgroup_dir_inode_operations 831 * -> cgroup_mkdir. 832 */ 833 834static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode); 835static struct dentry *cgroup_lookup(struct inode *, struct dentry *, unsigned int); 836static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry); 837static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 838 unsigned long subsys_mask); 839static const struct inode_operations cgroup_dir_inode_operations; 840static const struct file_operations proc_cgroupstats_operations; 841 842static struct backing_dev_info cgroup_backing_dev_info = { 843 .name = "cgroup", 844 .capabilities = BDI_CAP_NO_ACCT_AND_WRITEBACK, 845}; 846 847static int alloc_css_id(struct cgroup_subsys *ss, 848 struct cgroup *parent, struct cgroup *child); 849 850static struct inode *cgroup_new_inode(umode_t mode, struct super_block *sb) 851{ 852 struct inode *inode = new_inode(sb); 853 854 if (inode) { 855 inode->i_ino = get_next_ino(); 856 inode->i_mode = mode; 857 inode->i_uid = current_fsuid(); 858 inode->i_gid = current_fsgid(); 859 inode->i_atime = inode->i_mtime = inode->i_ctime = CURRENT_TIME; 860 inode->i_mapping->backing_dev_info = &cgroup_backing_dev_info; 861 } 862 return inode; 863} 864 865static void cgroup_free_fn(struct work_struct *work) 866{ 867 struct cgroup *cgrp = container_of(work, struct cgroup, free_work); 868 struct cgroup_subsys *ss; 869 870 mutex_lock(&cgroup_mutex); 871 /* 872 * Release the subsystem state objects. 873 */ 874 for_each_subsys(cgrp->root, ss) 875 ss->css_free(cgrp); 876 877 cgrp->root->number_of_cgroups--; 878 mutex_unlock(&cgroup_mutex); 879 880 /* 881 * Drop the active superblock reference that we took when we 882 * created the cgroup 883 */ 884 deactivate_super(cgrp->root->sb); 885 886 /* 887 * if we're getting rid of the cgroup, refcount should ensure 888 * that there are no pidlists left. 889 */ 890 BUG_ON(!list_empty(&cgrp->pidlists)); 891 892 simple_xattrs_free(&cgrp->xattrs); 893 894 ida_simple_remove(&cgrp->root->cgroup_ida, cgrp->id); 895 kfree(cgrp); 896} 897 898static void cgroup_free_rcu(struct rcu_head *head) 899{ 900 struct cgroup *cgrp = container_of(head, struct cgroup, rcu_head); 901 902 schedule_work(&cgrp->free_work); 903} 904 905static void cgroup_diput(struct dentry *dentry, struct inode *inode) 906{ 907 /* is dentry a directory ? if so, kfree() associated cgroup */ 908 if (S_ISDIR(inode->i_mode)) { 909 struct cgroup *cgrp = dentry->d_fsdata; 910 911 BUG_ON(!(cgroup_is_removed(cgrp))); 912 call_rcu(&cgrp->rcu_head, cgroup_free_rcu); 913 } else { 914 struct cfent *cfe = __d_cfe(dentry); 915 struct cgroup *cgrp = dentry->d_parent->d_fsdata; 916 917 WARN_ONCE(!list_empty(&cfe->node) && 918 cgrp != &cgrp->root->top_cgroup, 919 "cfe still linked for %s\n", cfe->type->name); 920 simple_xattrs_free(&cfe->xattrs); 921 kfree(cfe); 922 } 923 iput(inode); 924} 925 926static int cgroup_delete(const struct dentry *d) 927{ 928 return 1; 929} 930 931static void remove_dir(struct dentry *d) 932{ 933 struct dentry *parent = dget(d->d_parent); 934 935 d_delete(d); 936 simple_rmdir(parent->d_inode, d); 937 dput(parent); 938} 939 940static void cgroup_rm_file(struct cgroup *cgrp, const struct cftype *cft) 941{ 942 struct cfent *cfe; 943 944 lockdep_assert_held(&cgrp->dentry->d_inode->i_mutex); 945 lockdep_assert_held(&cgroup_mutex); 946 947 /* 948 * If we're doing cleanup due to failure of cgroup_create(), 949 * the corresponding @cfe may not exist. 950 */ 951 list_for_each_entry(cfe, &cgrp->files, node) { 952 struct dentry *d = cfe->dentry; 953 954 if (cft && cfe->type != cft) 955 continue; 956 957 dget(d); 958 d_delete(d); 959 simple_unlink(cgrp->dentry->d_inode, d); 960 list_del_init(&cfe->node); 961 dput(d); 962 963 break; 964 } 965} 966 967/** 968 * cgroup_clear_directory - selective removal of base and subsystem files 969 * @dir: directory containing the files 970 * @base_files: true if the base files should be removed 971 * @subsys_mask: mask of the subsystem ids whose files should be removed 972 */ 973static void cgroup_clear_directory(struct dentry *dir, bool base_files, 974 unsigned long subsys_mask) 975{ 976 struct cgroup *cgrp = __d_cgrp(dir); 977 struct cgroup_subsys *ss; 978 979 for_each_subsys(cgrp->root, ss) { 980 struct cftype_set *set; 981 if (!test_bit(ss->subsys_id, &subsys_mask)) 982 continue; 983 list_for_each_entry(set, &ss->cftsets, node) 984 cgroup_addrm_files(cgrp, NULL, set->cfts, false); 985 } 986 if (base_files) { 987 while (!list_empty(&cgrp->files)) 988 cgroup_rm_file(cgrp, NULL); 989 } 990} 991 992/* 993 * NOTE : the dentry must have been dget()'ed 994 */ 995static void cgroup_d_remove_dir(struct dentry *dentry) 996{ 997 struct dentry *parent; 998 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 999 1000 cgroup_clear_directory(dentry, true, root->subsys_mask); 1001 1002 parent = dentry->d_parent; 1003 spin_lock(&parent->d_lock); 1004 spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED); 1005 list_del_init(&dentry->d_u.d_child); 1006 spin_unlock(&dentry->d_lock); 1007 spin_unlock(&parent->d_lock); 1008 remove_dir(dentry); 1009} 1010 1011/* 1012 * Call with cgroup_mutex held. Drops reference counts on modules, including 1013 * any duplicate ones that parse_cgroupfs_options took. If this function 1014 * returns an error, no reference counts are touched. 1015 */ 1016static int rebind_subsystems(struct cgroupfs_root *root, 1017 unsigned long final_subsys_mask) 1018{ 1019 unsigned long added_mask, removed_mask; 1020 struct cgroup *cgrp = &root->top_cgroup; 1021 int i; 1022 1023 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1024 BUG_ON(!mutex_is_locked(&cgroup_root_mutex)); 1025 1026 removed_mask = root->actual_subsys_mask & ~final_subsys_mask; 1027 added_mask = final_subsys_mask & ~root->actual_subsys_mask; 1028 /* Check that any added subsystems are currently free */ 1029 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1030 unsigned long bit = 1UL << i; 1031 struct cgroup_subsys *ss = subsys[i]; 1032 if (!(bit & added_mask)) 1033 continue; 1034 /* 1035 * Nobody should tell us to do a subsys that doesn't exist: 1036 * parse_cgroupfs_options should catch that case and refcounts 1037 * ensure that subsystems won't disappear once selected. 1038 */ 1039 BUG_ON(ss == NULL); 1040 if (ss->root != &rootnode) { 1041 /* Subsystem isn't free */ 1042 return -EBUSY; 1043 } 1044 } 1045 1046 /* Currently we don't handle adding/removing subsystems when 1047 * any child cgroups exist. This is theoretically supportable 1048 * but involves complex error handling, so it's being left until 1049 * later */ 1050 if (root->number_of_cgroups > 1) 1051 return -EBUSY; 1052 1053 /* Process each subsystem */ 1054 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1055 struct cgroup_subsys *ss = subsys[i]; 1056 unsigned long bit = 1UL << i; 1057 if (bit & added_mask) { 1058 /* We're binding this subsystem to this hierarchy */ 1059 BUG_ON(ss == NULL); 1060 BUG_ON(cgrp->subsys[i]); 1061 BUG_ON(!dummytop->subsys[i]); 1062 BUG_ON(dummytop->subsys[i]->cgroup != dummytop); 1063 cgrp->subsys[i] = dummytop->subsys[i]; 1064 cgrp->subsys[i]->cgroup = cgrp; 1065 list_move(&ss->sibling, &root->subsys_list); 1066 ss->root = root; 1067 if (ss->bind) 1068 ss->bind(cgrp); 1069 /* refcount was already taken, and we're keeping it */ 1070 } else if (bit & removed_mask) { 1071 /* We're removing this subsystem */ 1072 BUG_ON(ss == NULL); 1073 BUG_ON(cgrp->subsys[i] != dummytop->subsys[i]); 1074 BUG_ON(cgrp->subsys[i]->cgroup != cgrp); 1075 if (ss->bind) 1076 ss->bind(dummytop); 1077 dummytop->subsys[i]->cgroup = dummytop; 1078 cgrp->subsys[i] = NULL; 1079 subsys[i]->root = &rootnode; 1080 list_move(&ss->sibling, &rootnode.subsys_list); 1081 /* subsystem is now free - drop reference on module */ 1082 module_put(ss->module); 1083 } else if (bit & final_subsys_mask) { 1084 /* Subsystem state should already exist */ 1085 BUG_ON(ss == NULL); 1086 BUG_ON(!cgrp->subsys[i]); 1087 /* 1088 * a refcount was taken, but we already had one, so 1089 * drop the extra reference. 1090 */ 1091 module_put(ss->module); 1092#ifdef CONFIG_MODULE_UNLOAD 1093 BUG_ON(ss->module && !module_refcount(ss->module)); 1094#endif 1095 } else { 1096 /* Subsystem state shouldn't exist */ 1097 BUG_ON(cgrp->subsys[i]); 1098 } 1099 } 1100 root->subsys_mask = root->actual_subsys_mask = final_subsys_mask; 1101 1102 return 0; 1103} 1104 1105static int cgroup_show_options(struct seq_file *seq, struct dentry *dentry) 1106{ 1107 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 1108 struct cgroup_subsys *ss; 1109 1110 mutex_lock(&cgroup_root_mutex); 1111 for_each_subsys(root, ss) 1112 seq_printf(seq, ",%s", ss->name); 1113 if (test_bit(ROOT_NOPREFIX, &root->flags)) 1114 seq_puts(seq, ",noprefix"); 1115 if (test_bit(ROOT_XATTR, &root->flags)) 1116 seq_puts(seq, ",xattr"); 1117 if (strlen(root->release_agent_path)) 1118 seq_printf(seq, ",release_agent=%s", root->release_agent_path); 1119 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags)) 1120 seq_puts(seq, ",clone_children"); 1121 if (strlen(root->name)) 1122 seq_printf(seq, ",name=%s", root->name); 1123 mutex_unlock(&cgroup_root_mutex); 1124 return 0; 1125} 1126 1127struct cgroup_sb_opts { 1128 unsigned long subsys_mask; 1129 unsigned long flags; 1130 char *release_agent; 1131 bool cpuset_clone_children; 1132 char *name; 1133 /* User explicitly requested empty subsystem */ 1134 bool none; 1135 1136 struct cgroupfs_root *new_root; 1137 1138}; 1139 1140/* 1141 * Convert a hierarchy specifier into a bitmask of subsystems and flags. Call 1142 * with cgroup_mutex held to protect the subsys[] array. This function takes 1143 * refcounts on subsystems to be used, unless it returns error, in which case 1144 * no refcounts are taken. 1145 */ 1146static int parse_cgroupfs_options(char *data, struct cgroup_sb_opts *opts) 1147{ 1148 char *token, *o = data; 1149 bool all_ss = false, one_ss = false; 1150 unsigned long mask = (unsigned long)-1; 1151 int i; 1152 bool module_pin_failed = false; 1153 1154 BUG_ON(!mutex_is_locked(&cgroup_mutex)); 1155 1156#ifdef CONFIG_CPUSETS 1157 mask = ~(1UL << cpuset_subsys_id); 1158#endif 1159 1160 memset(opts, 0, sizeof(*opts)); 1161 1162 while ((token = strsep(&o, ",")) != NULL) { 1163 if (!*token) 1164 return -EINVAL; 1165 if (!strcmp(token, "none")) { 1166 /* Explicitly have no subsystems */ 1167 opts->none = true; 1168 continue; 1169 } 1170 if (!strcmp(token, "all")) { 1171 /* Mutually exclusive option 'all' + subsystem name */ 1172 if (one_ss) 1173 return -EINVAL; 1174 all_ss = true; 1175 continue; 1176 } 1177 if (!strcmp(token, "noprefix")) { 1178 set_bit(ROOT_NOPREFIX, &opts->flags); 1179 continue; 1180 } 1181 if (!strcmp(token, "clone_children")) { 1182 opts->cpuset_clone_children = true; 1183 continue; 1184 } 1185 if (!strcmp(token, "xattr")) { 1186 set_bit(ROOT_XATTR, &opts->flags); 1187 continue; 1188 } 1189 if (!strncmp(token, "release_agent=", 14)) { 1190 /* Specifying two release agents is forbidden */ 1191 if (opts->release_agent) 1192 return -EINVAL; 1193 opts->release_agent = 1194 kstrndup(token + 14, PATH_MAX - 1, GFP_KERNEL); 1195 if (!opts->release_agent) 1196 return -ENOMEM; 1197 continue; 1198 } 1199 if (!strncmp(token, "name=", 5)) { 1200 const char *name = token + 5; 1201 /* Can't specify an empty name */ 1202 if (!strlen(name)) 1203 return -EINVAL; 1204 /* Must match [\w.-]+ */ 1205 for (i = 0; i < strlen(name); i++) { 1206 char c = name[i]; 1207 if (isalnum(c)) 1208 continue; 1209 if ((c == '.') || (c == '-') || (c == '_')) 1210 continue; 1211 return -EINVAL; 1212 } 1213 /* Specifying two names is forbidden */ 1214 if (opts->name) 1215 return -EINVAL; 1216 opts->name = kstrndup(name, 1217 MAX_CGROUP_ROOT_NAMELEN - 1, 1218 GFP_KERNEL); 1219 if (!opts->name) 1220 return -ENOMEM; 1221 1222 continue; 1223 } 1224 1225 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1226 struct cgroup_subsys *ss = subsys[i]; 1227 if (ss == NULL) 1228 continue; 1229 if (strcmp(token, ss->name)) 1230 continue; 1231 if (ss->disabled) 1232 continue; 1233 1234 /* Mutually exclusive option 'all' + subsystem name */ 1235 if (all_ss) 1236 return -EINVAL; 1237 set_bit(i, &opts->subsys_mask); 1238 one_ss = true; 1239 1240 break; 1241 } 1242 if (i == CGROUP_SUBSYS_COUNT) 1243 return -ENOENT; 1244 } 1245 1246 /* 1247 * If the 'all' option was specified select all the subsystems, 1248 * otherwise if 'none', 'name=' and a subsystem name options 1249 * were not specified, let's default to 'all' 1250 */ 1251 if (all_ss || (!one_ss && !opts->none && !opts->name)) { 1252 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1253 struct cgroup_subsys *ss = subsys[i]; 1254 if (ss == NULL) 1255 continue; 1256 if (ss->disabled) 1257 continue; 1258 set_bit(i, &opts->subsys_mask); 1259 } 1260 } 1261 1262 /* Consistency checks */ 1263 1264 /* 1265 * Option noprefix was introduced just for backward compatibility 1266 * with the old cpuset, so we allow noprefix only if mounting just 1267 * the cpuset subsystem. 1268 */ 1269 if (test_bit(ROOT_NOPREFIX, &opts->flags) && 1270 (opts->subsys_mask & mask)) 1271 return -EINVAL; 1272 1273 1274 /* Can't specify "none" and some subsystems */ 1275 if (opts->subsys_mask && opts->none) 1276 return -EINVAL; 1277 1278 /* 1279 * We either have to specify by name or by subsystems. (So all 1280 * empty hierarchies must have a name). 1281 */ 1282 if (!opts->subsys_mask && !opts->name) 1283 return -EINVAL; 1284 1285 /* 1286 * Grab references on all the modules we'll need, so the subsystems 1287 * don't dance around before rebind_subsystems attaches them. This may 1288 * take duplicate reference counts on a subsystem that's already used, 1289 * but rebind_subsystems handles this case. 1290 */ 1291 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1292 unsigned long bit = 1UL << i; 1293 1294 if (!(bit & opts->subsys_mask)) 1295 continue; 1296 if (!try_module_get(subsys[i]->module)) { 1297 module_pin_failed = true; 1298 break; 1299 } 1300 } 1301 if (module_pin_failed) { 1302 /* 1303 * oops, one of the modules was going away. this means that we 1304 * raced with a module_delete call, and to the user this is 1305 * essentially a "subsystem doesn't exist" case. 1306 */ 1307 for (i--; i >= 0; i--) { 1308 /* drop refcounts only on the ones we took */ 1309 unsigned long bit = 1UL << i; 1310 1311 if (!(bit & opts->subsys_mask)) 1312 continue; 1313 module_put(subsys[i]->module); 1314 } 1315 return -ENOENT; 1316 } 1317 1318 return 0; 1319} 1320 1321static void drop_parsed_module_refcounts(unsigned long subsys_mask) 1322{ 1323 int i; 1324 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 1325 unsigned long bit = 1UL << i; 1326 1327 if (!(bit & subsys_mask)) 1328 continue; 1329 module_put(subsys[i]->module); 1330 } 1331} 1332 1333static int cgroup_remount(struct super_block *sb, int *flags, char *data) 1334{ 1335 int ret = 0; 1336 struct cgroupfs_root *root = sb->s_fs_info; 1337 struct cgroup *cgrp = &root->top_cgroup; 1338 struct cgroup_sb_opts opts; 1339 unsigned long added_mask, removed_mask; 1340 1341 mutex_lock(&cgrp->dentry->d_inode->i_mutex); 1342 mutex_lock(&cgroup_mutex); 1343 mutex_lock(&cgroup_root_mutex); 1344 1345 /* See what subsystems are wanted */ 1346 ret = parse_cgroupfs_options(data, &opts); 1347 if (ret) 1348 goto out_unlock; 1349 1350 if (opts.subsys_mask != root->actual_subsys_mask || opts.release_agent) 1351 pr_warning("cgroup: option changes via remount are deprecated (pid=%d comm=%s)\n", 1352 task_tgid_nr(current), current->comm); 1353 1354 added_mask = opts.subsys_mask & ~root->subsys_mask; 1355 removed_mask = root->subsys_mask & ~opts.subsys_mask; 1356 1357 /* Don't allow flags or name to change at remount */ 1358 if (opts.flags != root->flags || 1359 (opts.name && strcmp(opts.name, root->name))) { 1360 ret = -EINVAL; 1361 drop_parsed_module_refcounts(opts.subsys_mask); 1362 goto out_unlock; 1363 } 1364 1365 /* 1366 * Clear out the files of subsystems that should be removed, do 1367 * this before rebind_subsystems, since rebind_subsystems may 1368 * change this hierarchy's subsys_list. 1369 */ 1370 cgroup_clear_directory(cgrp->dentry, false, removed_mask); 1371 1372 ret = rebind_subsystems(root, opts.subsys_mask); 1373 if (ret) { 1374 /* rebind_subsystems failed, re-populate the removed files */ 1375 cgroup_populate_dir(cgrp, false, removed_mask); 1376 drop_parsed_module_refcounts(opts.subsys_mask); 1377 goto out_unlock; 1378 } 1379 1380 /* re-populate subsystem files */ 1381 cgroup_populate_dir(cgrp, false, added_mask); 1382 1383 if (opts.release_agent) 1384 strcpy(root->release_agent_path, opts.release_agent); 1385 out_unlock: 1386 kfree(opts.release_agent); 1387 kfree(opts.name); 1388 mutex_unlock(&cgroup_root_mutex); 1389 mutex_unlock(&cgroup_mutex); 1390 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 1391 return ret; 1392} 1393 1394static const struct super_operations cgroup_ops = { 1395 .statfs = simple_statfs, 1396 .drop_inode = generic_delete_inode, 1397 .show_options = cgroup_show_options, 1398 .remount_fs = cgroup_remount, 1399}; 1400 1401static void init_cgroup_housekeeping(struct cgroup *cgrp) 1402{ 1403 INIT_LIST_HEAD(&cgrp->sibling); 1404 INIT_LIST_HEAD(&cgrp->children); 1405 INIT_LIST_HEAD(&cgrp->files); 1406 INIT_LIST_HEAD(&cgrp->css_sets); 1407 INIT_LIST_HEAD(&cgrp->allcg_node); 1408 INIT_LIST_HEAD(&cgrp->release_list); 1409 INIT_LIST_HEAD(&cgrp->pidlists); 1410 INIT_WORK(&cgrp->free_work, cgroup_free_fn); 1411 mutex_init(&cgrp->pidlist_mutex); 1412 INIT_LIST_HEAD(&cgrp->event_list); 1413 spin_lock_init(&cgrp->event_list_lock); 1414 simple_xattrs_init(&cgrp->xattrs); 1415} 1416 1417static void init_cgroup_root(struct cgroupfs_root *root) 1418{ 1419 struct cgroup *cgrp = &root->top_cgroup; 1420 1421 INIT_LIST_HEAD(&root->subsys_list); 1422 INIT_LIST_HEAD(&root->root_list); 1423 INIT_LIST_HEAD(&root->allcg_list); 1424 root->number_of_cgroups = 1; 1425 cgrp->root = root; 1426 cgrp->top_cgroup = cgrp; 1427 init_cgroup_housekeeping(cgrp); 1428 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 1429} 1430 1431static bool init_root_id(struct cgroupfs_root *root) 1432{ 1433 int ret = 0; 1434 1435 do { 1436 if (!ida_pre_get(&hierarchy_ida, GFP_KERNEL)) 1437 return false; 1438 spin_lock(&hierarchy_id_lock); 1439 /* Try to allocate the next unused ID */ 1440 ret = ida_get_new_above(&hierarchy_ida, next_hierarchy_id, 1441 &root->hierarchy_id); 1442 if (ret == -ENOSPC) 1443 /* Try again starting from 0 */ 1444 ret = ida_get_new(&hierarchy_ida, &root->hierarchy_id); 1445 if (!ret) { 1446 next_hierarchy_id = root->hierarchy_id + 1; 1447 } else if (ret != -EAGAIN) { 1448 /* Can only get here if the 31-bit IDR is full ... */ 1449 BUG_ON(ret); 1450 } 1451 spin_unlock(&hierarchy_id_lock); 1452 } while (ret); 1453 return true; 1454} 1455 1456static int cgroup_test_super(struct super_block *sb, void *data) 1457{ 1458 struct cgroup_sb_opts *opts = data; 1459 struct cgroupfs_root *root = sb->s_fs_info; 1460 1461 /* If we asked for a name then it must match */ 1462 if (opts->name && strcmp(opts->name, root->name)) 1463 return 0; 1464 1465 /* 1466 * If we asked for subsystems (or explicitly for no 1467 * subsystems) then they must match 1468 */ 1469 if ((opts->subsys_mask || opts->none) 1470 && (opts->subsys_mask != root->subsys_mask)) 1471 return 0; 1472 1473 return 1; 1474} 1475 1476static struct cgroupfs_root *cgroup_root_from_opts(struct cgroup_sb_opts *opts) 1477{ 1478 struct cgroupfs_root *root; 1479 1480 if (!opts->subsys_mask && !opts->none) 1481 return NULL; 1482 1483 root = kzalloc(sizeof(*root), GFP_KERNEL); 1484 if (!root) 1485 return ERR_PTR(-ENOMEM); 1486 1487 if (!init_root_id(root)) { 1488 kfree(root); 1489 return ERR_PTR(-ENOMEM); 1490 } 1491 init_cgroup_root(root); 1492 1493 root->subsys_mask = opts->subsys_mask; 1494 root->flags = opts->flags; 1495 ida_init(&root->cgroup_ida); 1496 if (opts->release_agent) 1497 strcpy(root->release_agent_path, opts->release_agent); 1498 if (opts->name) 1499 strcpy(root->name, opts->name); 1500 if (opts->cpuset_clone_children) 1501 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &root->top_cgroup.flags); 1502 return root; 1503} 1504 1505static void cgroup_drop_root(struct cgroupfs_root *root) 1506{ 1507 if (!root) 1508 return; 1509 1510 BUG_ON(!root->hierarchy_id); 1511 spin_lock(&hierarchy_id_lock); 1512 ida_remove(&hierarchy_ida, root->hierarchy_id); 1513 spin_unlock(&hierarchy_id_lock); 1514 ida_destroy(&root->cgroup_ida); 1515 kfree(root); 1516} 1517 1518static int cgroup_set_super(struct super_block *sb, void *data) 1519{ 1520 int ret; 1521 struct cgroup_sb_opts *opts = data; 1522 1523 /* If we don't have a new root, we can't set up a new sb */ 1524 if (!opts->new_root) 1525 return -EINVAL; 1526 1527 BUG_ON(!opts->subsys_mask && !opts->none); 1528 1529 ret = set_anon_super(sb, NULL); 1530 if (ret) 1531 return ret; 1532 1533 sb->s_fs_info = opts->new_root; 1534 opts->new_root->sb = sb; 1535 1536 sb->s_blocksize = PAGE_CACHE_SIZE; 1537 sb->s_blocksize_bits = PAGE_CACHE_SHIFT; 1538 sb->s_magic = CGROUP_SUPER_MAGIC; 1539 sb->s_op = &cgroup_ops; 1540 1541 return 0; 1542} 1543 1544static int cgroup_get_rootdir(struct super_block *sb) 1545{ 1546 static const struct dentry_operations cgroup_dops = { 1547 .d_iput = cgroup_diput, 1548 .d_delete = cgroup_delete, 1549 }; 1550 1551 struct inode *inode = 1552 cgroup_new_inode(S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR, sb); 1553 1554 if (!inode) 1555 return -ENOMEM; 1556 1557 inode->i_fop = &simple_dir_operations; 1558 inode->i_op = &cgroup_dir_inode_operations; 1559 /* directories start off with i_nlink == 2 (for "." entry) */ 1560 inc_nlink(inode); 1561 sb->s_root = d_make_root(inode); 1562 if (!sb->s_root) 1563 return -ENOMEM; 1564 /* for everything else we want ->d_op set */ 1565 sb->s_d_op = &cgroup_dops; 1566 return 0; 1567} 1568 1569static struct dentry *cgroup_mount(struct file_system_type *fs_type, 1570 int flags, const char *unused_dev_name, 1571 void *data) 1572{ 1573 struct cgroup_sb_opts opts; 1574 struct cgroupfs_root *root; 1575 int ret = 0; 1576 struct super_block *sb; 1577 struct cgroupfs_root *new_root; 1578 struct inode *inode; 1579 1580 /* First find the desired set of subsystems */ 1581 mutex_lock(&cgroup_mutex); 1582 ret = parse_cgroupfs_options(data, &opts); 1583 mutex_unlock(&cgroup_mutex); 1584 if (ret) 1585 goto out_err; 1586 1587 /* 1588 * Allocate a new cgroup root. We may not need it if we're 1589 * reusing an existing hierarchy. 1590 */ 1591 new_root = cgroup_root_from_opts(&opts); 1592 if (IS_ERR(new_root)) { 1593 ret = PTR_ERR(new_root); 1594 goto drop_modules; 1595 } 1596 opts.new_root = new_root; 1597 1598 /* Locate an existing or new sb for this hierarchy */ 1599 sb = sget(fs_type, cgroup_test_super, cgroup_set_super, 0, &opts); 1600 if (IS_ERR(sb)) { 1601 ret = PTR_ERR(sb); 1602 cgroup_drop_root(opts.new_root); 1603 goto drop_modules; 1604 } 1605 1606 root = sb->s_fs_info; 1607 BUG_ON(!root); 1608 if (root == opts.new_root) { 1609 /* We used the new root structure, so this is a new hierarchy */ 1610 struct list_head tmp_cg_links; 1611 struct cgroup *root_cgrp = &root->top_cgroup; 1612 struct cgroupfs_root *existing_root; 1613 const struct cred *cred; 1614 int i; 1615 struct css_set *cg; 1616 1617 BUG_ON(sb->s_root != NULL); 1618 1619 ret = cgroup_get_rootdir(sb); 1620 if (ret) 1621 goto drop_new_super; 1622 inode = sb->s_root->d_inode; 1623 1624 mutex_lock(&inode->i_mutex); 1625 mutex_lock(&cgroup_mutex); 1626 mutex_lock(&cgroup_root_mutex); 1627 1628 /* Check for name clashes with existing mounts */ 1629 ret = -EBUSY; 1630 if (strlen(root->name)) 1631 for_each_active_root(existing_root) 1632 if (!strcmp(existing_root->name, root->name)) 1633 goto unlock_drop; 1634 1635 /* 1636 * We're accessing css_set_count without locking 1637 * css_set_lock here, but that's OK - it can only be 1638 * increased by someone holding cgroup_lock, and 1639 * that's us. The worst that can happen is that we 1640 * have some link structures left over 1641 */ 1642 ret = allocate_cg_links(css_set_count, &tmp_cg_links); 1643 if (ret) 1644 goto unlock_drop; 1645 1646 ret = rebind_subsystems(root, root->subsys_mask); 1647 if (ret == -EBUSY) { 1648 free_cg_links(&tmp_cg_links); 1649 goto unlock_drop; 1650 } 1651 /* 1652 * There must be no failure case after here, since rebinding 1653 * takes care of subsystems' refcounts, which are explicitly 1654 * dropped in the failure exit path. 1655 */ 1656 1657 /* EBUSY should be the only error here */ 1658 BUG_ON(ret); 1659 1660 list_add(&root->root_list, &roots); 1661 root_count++; 1662 1663 sb->s_root->d_fsdata = root_cgrp; 1664 root->top_cgroup.dentry = sb->s_root; 1665 1666 /* Link the top cgroup in this hierarchy into all 1667 * the css_set objects */ 1668 write_lock(&css_set_lock); 1669 hash_for_each(css_set_table, i, cg, hlist) 1670 link_css_set(&tmp_cg_links, cg, root_cgrp); 1671 write_unlock(&css_set_lock); 1672 1673 free_cg_links(&tmp_cg_links); 1674 1675 BUG_ON(!list_empty(&root_cgrp->children)); 1676 BUG_ON(root->number_of_cgroups != 1); 1677 1678 cred = override_creds(&init_cred); 1679 cgroup_populate_dir(root_cgrp, true, root->subsys_mask); 1680 revert_creds(cred); 1681 mutex_unlock(&cgroup_root_mutex); 1682 mutex_unlock(&cgroup_mutex); 1683 mutex_unlock(&inode->i_mutex); 1684 } else { 1685 /* 1686 * We re-used an existing hierarchy - the new root (if 1687 * any) is not needed 1688 */ 1689 cgroup_drop_root(opts.new_root); 1690 /* no subsys rebinding, so refcounts don't change */ 1691 drop_parsed_module_refcounts(opts.subsys_mask); 1692 } 1693 1694 kfree(opts.release_agent); 1695 kfree(opts.name); 1696 return dget(sb->s_root); 1697 1698 unlock_drop: 1699 mutex_unlock(&cgroup_root_mutex); 1700 mutex_unlock(&cgroup_mutex); 1701 mutex_unlock(&inode->i_mutex); 1702 drop_new_super: 1703 deactivate_locked_super(sb); 1704 drop_modules: 1705 drop_parsed_module_refcounts(opts.subsys_mask); 1706 out_err: 1707 kfree(opts.release_agent); 1708 kfree(opts.name); 1709 return ERR_PTR(ret); 1710} 1711 1712static void cgroup_kill_sb(struct super_block *sb) { 1713 struct cgroupfs_root *root = sb->s_fs_info; 1714 struct cgroup *cgrp = &root->top_cgroup; 1715 int ret; 1716 struct cg_cgroup_link *link; 1717 struct cg_cgroup_link *saved_link; 1718 1719 BUG_ON(!root); 1720 1721 BUG_ON(root->number_of_cgroups != 1); 1722 BUG_ON(!list_empty(&cgrp->children)); 1723 1724 mutex_lock(&cgroup_mutex); 1725 mutex_lock(&cgroup_root_mutex); 1726 1727 /* Rebind all subsystems back to the default hierarchy */ 1728 ret = rebind_subsystems(root, 0); 1729 /* Shouldn't be able to fail ... */ 1730 BUG_ON(ret); 1731 1732 /* 1733 * Release all the links from css_sets to this hierarchy's 1734 * root cgroup 1735 */ 1736 write_lock(&css_set_lock); 1737 1738 list_for_each_entry_safe(link, saved_link, &cgrp->css_sets, 1739 cgrp_link_list) { 1740 list_del(&link->cg_link_list); 1741 list_del(&link->cgrp_link_list); 1742 kfree(link); 1743 } 1744 write_unlock(&css_set_lock); 1745 1746 if (!list_empty(&root->root_list)) { 1747 list_del(&root->root_list); 1748 root_count--; 1749 } 1750 1751 mutex_unlock(&cgroup_root_mutex); 1752 mutex_unlock(&cgroup_mutex); 1753 1754 simple_xattrs_free(&cgrp->xattrs); 1755 1756 kill_litter_super(sb); 1757 cgroup_drop_root(root); 1758} 1759 1760static struct file_system_type cgroup_fs_type = { 1761 .name = "cgroup", 1762 .mount = cgroup_mount, 1763 .kill_sb = cgroup_kill_sb, 1764}; 1765 1766static struct kobject *cgroup_kobj; 1767 1768/** 1769 * cgroup_path - generate the path of a cgroup 1770 * @cgrp: the cgroup in question 1771 * @buf: the buffer to write the path into 1772 * @buflen: the length of the buffer 1773 * 1774 * Called with cgroup_mutex held or else with an RCU-protected cgroup 1775 * reference. Writes path of cgroup into buf. Returns 0 on success, 1776 * -errno on error. 1777 */ 1778int cgroup_path(const struct cgroup *cgrp, char *buf, int buflen) 1779{ 1780 struct dentry *dentry = cgrp->dentry; 1781 char *start; 1782 1783 rcu_lockdep_assert(rcu_read_lock_held() || cgroup_lock_is_held(), 1784 "cgroup_path() called without proper locking"); 1785 1786 if (cgrp == dummytop) { 1787 /* 1788 * Inactive subsystems have no dentry for their root 1789 * cgroup 1790 */ 1791 strcpy(buf, "/"); 1792 return 0; 1793 } 1794 1795 start = buf + buflen - 1; 1796 1797 *start = '\0'; 1798 for (;;) { 1799 int len = dentry->d_name.len; 1800 1801 if ((start -= len) < buf) 1802 return -ENAMETOOLONG; 1803 memcpy(start, dentry->d_name.name, len); 1804 cgrp = cgrp->parent; 1805 if (!cgrp) 1806 break; 1807 1808 dentry = cgrp->dentry; 1809 if (!cgrp->parent) 1810 continue; 1811 if (--start < buf) 1812 return -ENAMETOOLONG; 1813 *start = '/'; 1814 } 1815 memmove(buf, start, buf + buflen - start); 1816 return 0; 1817} 1818EXPORT_SYMBOL_GPL(cgroup_path); 1819 1820/* 1821 * Control Group taskset 1822 */ 1823struct task_and_cgroup { 1824 struct task_struct *task; 1825 struct cgroup *cgrp; 1826 struct css_set *cg; 1827}; 1828 1829struct cgroup_taskset { 1830 struct task_and_cgroup single; 1831 struct flex_array *tc_array; 1832 int tc_array_len; 1833 int idx; 1834 struct cgroup *cur_cgrp; 1835}; 1836 1837/** 1838 * cgroup_taskset_first - reset taskset and return the first task 1839 * @tset: taskset of interest 1840 * 1841 * @tset iteration is initialized and the first task is returned. 1842 */ 1843struct task_struct *cgroup_taskset_first(struct cgroup_taskset *tset) 1844{ 1845 if (tset->tc_array) { 1846 tset->idx = 0; 1847 return cgroup_taskset_next(tset); 1848 } else { 1849 tset->cur_cgrp = tset->single.cgrp; 1850 return tset->single.task; 1851 } 1852} 1853EXPORT_SYMBOL_GPL(cgroup_taskset_first); 1854 1855/** 1856 * cgroup_taskset_next - iterate to the next task in taskset 1857 * @tset: taskset of interest 1858 * 1859 * Return the next task in @tset. Iteration must have been initialized 1860 * with cgroup_taskset_first(). 1861 */ 1862struct task_struct *cgroup_taskset_next(struct cgroup_taskset *tset) 1863{ 1864 struct task_and_cgroup *tc; 1865 1866 if (!tset->tc_array || tset->idx >= tset->tc_array_len) 1867 return NULL; 1868 1869 tc = flex_array_get(tset->tc_array, tset->idx++); 1870 tset->cur_cgrp = tc->cgrp; 1871 return tc->task; 1872} 1873EXPORT_SYMBOL_GPL(cgroup_taskset_next); 1874 1875/** 1876 * cgroup_taskset_cur_cgroup - return the matching cgroup for the current task 1877 * @tset: taskset of interest 1878 * 1879 * Return the cgroup for the current (last returned) task of @tset. This 1880 * function must be preceded by either cgroup_taskset_first() or 1881 * cgroup_taskset_next(). 1882 */ 1883struct cgroup *cgroup_taskset_cur_cgroup(struct cgroup_taskset *tset) 1884{ 1885 return tset->cur_cgrp; 1886} 1887EXPORT_SYMBOL_GPL(cgroup_taskset_cur_cgroup); 1888 1889/** 1890 * cgroup_taskset_size - return the number of tasks in taskset 1891 * @tset: taskset of interest 1892 */ 1893int cgroup_taskset_size(struct cgroup_taskset *tset) 1894{ 1895 return tset->tc_array ? tset->tc_array_len : 1; 1896} 1897EXPORT_SYMBOL_GPL(cgroup_taskset_size); 1898 1899 1900/* 1901 * cgroup_task_migrate - move a task from one cgroup to another. 1902 * 1903 * Must be called with cgroup_mutex and threadgroup locked. 1904 */ 1905static void cgroup_task_migrate(struct cgroup *cgrp, struct cgroup *oldcgrp, 1906 struct task_struct *tsk, struct css_set *newcg) 1907{ 1908 struct css_set *oldcg; 1909 1910 /* 1911 * We are synchronized through threadgroup_lock() against PF_EXITING 1912 * setting such that we can't race against cgroup_exit() changing the 1913 * css_set to init_css_set and dropping the old one. 1914 */ 1915 WARN_ON_ONCE(tsk->flags & PF_EXITING); 1916 oldcg = tsk->cgroups; 1917 1918 task_lock(tsk); 1919 rcu_assign_pointer(tsk->cgroups, newcg); 1920 task_unlock(tsk); 1921 1922 /* Update the css_set linked lists if we're using them */ 1923 write_lock(&css_set_lock); 1924 if (!list_empty(&tsk->cg_list)) 1925 list_move(&tsk->cg_list, &newcg->tasks); 1926 write_unlock(&css_set_lock); 1927 1928 /* 1929 * We just gained a reference on oldcg by taking it from the task. As 1930 * trading it for newcg is protected by cgroup_mutex, we're safe to drop 1931 * it here; it will be freed under RCU. 1932 */ 1933 set_bit(CGRP_RELEASABLE, &oldcgrp->flags); 1934 put_css_set(oldcg); 1935} 1936 1937/** 1938 * cgroup_attach_task - attach task 'tsk' to cgroup 'cgrp' 1939 * @cgrp: the cgroup the task is attaching to 1940 * @tsk: the task to be attached 1941 * 1942 * Call with cgroup_mutex and threadgroup locked. May take task_lock of 1943 * @tsk during call. 1944 */ 1945int cgroup_attach_task(struct cgroup *cgrp, struct task_struct *tsk) 1946{ 1947 int retval = 0; 1948 struct cgroup_subsys *ss, *failed_ss = NULL; 1949 struct cgroup *oldcgrp; 1950 struct cgroupfs_root *root = cgrp->root; 1951 struct cgroup_taskset tset = { }; 1952 struct css_set *newcg; 1953 1954 /* @tsk either already exited or can't exit until the end */ 1955 if (tsk->flags & PF_EXITING) 1956 return -ESRCH; 1957 1958 /* Nothing to do if the task is already in that cgroup */ 1959 oldcgrp = task_cgroup_from_root(tsk, root); 1960 if (cgrp == oldcgrp) 1961 return 0; 1962 1963 tset.single.task = tsk; 1964 tset.single.cgrp = oldcgrp; 1965 1966 for_each_subsys(root, ss) { 1967 if (ss->can_attach) { 1968 retval = ss->can_attach(cgrp, &tset); 1969 if (retval) { 1970 /* 1971 * Remember on which subsystem the can_attach() 1972 * failed, so that we only call cancel_attach() 1973 * against the subsystems whose can_attach() 1974 * succeeded. (See below) 1975 */ 1976 failed_ss = ss; 1977 goto out; 1978 } 1979 } 1980 } 1981 1982 newcg = find_css_set(tsk->cgroups, cgrp); 1983 if (!newcg) { 1984 retval = -ENOMEM; 1985 goto out; 1986 } 1987 1988 cgroup_task_migrate(cgrp, oldcgrp, tsk, newcg); 1989 1990 for_each_subsys(root, ss) { 1991 if (ss->attach) 1992 ss->attach(cgrp, &tset); 1993 } 1994 1995out: 1996 if (retval) { 1997 for_each_subsys(root, ss) { 1998 if (ss == failed_ss) 1999 /* 2000 * This subsystem was the one that failed the 2001 * can_attach() check earlier, so we don't need 2002 * to call cancel_attach() against it or any 2003 * remaining subsystems. 2004 */ 2005 break; 2006 if (ss->cancel_attach) 2007 ss->cancel_attach(cgrp, &tset); 2008 } 2009 } 2010 return retval; 2011} 2012 2013/** 2014 * cgroup_attach_task_all - attach task 'tsk' to all cgroups of task 'from' 2015 * @from: attach to all cgroups of a given task 2016 * @tsk: the task to be attached 2017 */ 2018int cgroup_attach_task_all(struct task_struct *from, struct task_struct *tsk) 2019{ 2020 struct cgroupfs_root *root; 2021 int retval = 0; 2022 2023 cgroup_lock(); 2024 for_each_active_root(root) { 2025 struct cgroup *from_cg = task_cgroup_from_root(from, root); 2026 2027 retval = cgroup_attach_task(from_cg, tsk); 2028 if (retval) 2029 break; 2030 } 2031 cgroup_unlock(); 2032 2033 return retval; 2034} 2035EXPORT_SYMBOL_GPL(cgroup_attach_task_all); 2036 2037/** 2038 * cgroup_attach_proc - attach all threads in a threadgroup to a cgroup 2039 * @cgrp: the cgroup to attach to 2040 * @leader: the threadgroup leader task_struct of the group to be attached 2041 * 2042 * Call holding cgroup_mutex and the group_rwsem of the leader. Will take 2043 * task_lock of each thread in leader's threadgroup individually in turn. 2044 */ 2045static int cgroup_attach_proc(struct cgroup *cgrp, struct task_struct *leader) 2046{ 2047 int retval, i, group_size; 2048 struct cgroup_subsys *ss, *failed_ss = NULL; 2049 /* guaranteed to be initialized later, but the compiler needs this */ 2050 struct cgroupfs_root *root = cgrp->root; 2051 /* threadgroup list cursor and array */ 2052 struct task_struct *tsk; 2053 struct task_and_cgroup *tc; 2054 struct flex_array *group; 2055 struct cgroup_taskset tset = { }; 2056 2057 /* 2058 * step 0: in order to do expensive, possibly blocking operations for 2059 * every thread, we cannot iterate the thread group list, since it needs 2060 * rcu or tasklist locked. instead, build an array of all threads in the 2061 * group - group_rwsem prevents new threads from appearing, and if 2062 * threads exit, this will just be an over-estimate. 2063 */ 2064 group_size = get_nr_threads(leader); 2065 /* flex_array supports very large thread-groups better than kmalloc. */ 2066 group = flex_array_alloc(sizeof(*tc), group_size, GFP_KERNEL); 2067 if (!group) 2068 return -ENOMEM; 2069 /* pre-allocate to guarantee space while iterating in rcu read-side. */ 2070 retval = flex_array_prealloc(group, 0, group_size, GFP_KERNEL); 2071 if (retval) 2072 goto out_free_group_list; 2073 2074 tsk = leader; 2075 i = 0; 2076 /* 2077 * Prevent freeing of tasks while we take a snapshot. Tasks that are 2078 * already PF_EXITING could be freed from underneath us unless we 2079 * take an rcu_read_lock. 2080 */ 2081 rcu_read_lock(); 2082 do { 2083 struct task_and_cgroup ent; 2084 2085 /* @tsk either already exited or can't exit until the end */ 2086 if (tsk->flags & PF_EXITING) 2087 continue; 2088 2089 /* as per above, nr_threads may decrease, but not increase. */ 2090 BUG_ON(i >= group_size); 2091 ent.task = tsk; 2092 ent.cgrp = task_cgroup_from_root(tsk, root); 2093 /* nothing to do if this task is already in the cgroup */ 2094 if (ent.cgrp == cgrp) 2095 continue; 2096 /* 2097 * saying GFP_ATOMIC has no effect here because we did prealloc 2098 * earlier, but it's good form to communicate our expectations. 2099 */ 2100 retval = flex_array_put(group, i, &ent, GFP_ATOMIC); 2101 BUG_ON(retval != 0); 2102 i++; 2103 } while_each_thread(leader, tsk); 2104 rcu_read_unlock(); 2105 /* remember the number of threads in the array for later. */ 2106 group_size = i; 2107 tset.tc_array = group; 2108 tset.tc_array_len = group_size; 2109 2110 /* methods shouldn't be called if no task is actually migrating */ 2111 retval = 0; 2112 if (!group_size) 2113 goto out_free_group_list; 2114 2115 /* 2116 * step 1: check that we can legitimately attach to the cgroup. 2117 */ 2118 for_each_subsys(root, ss) { 2119 if (ss->can_attach) { 2120 retval = ss->can_attach(cgrp, &tset); 2121 if (retval) { 2122 failed_ss = ss; 2123 goto out_cancel_attach; 2124 } 2125 } 2126 } 2127 2128 /* 2129 * step 2: make sure css_sets exist for all threads to be migrated. 2130 * we use find_css_set, which allocates a new one if necessary. 2131 */ 2132 for (i = 0; i < group_size; i++) { 2133 tc = flex_array_get(group, i); 2134 tc->cg = find_css_set(tc->task->cgroups, cgrp); 2135 if (!tc->cg) { 2136 retval = -ENOMEM; 2137 goto out_put_css_set_refs; 2138 } 2139 } 2140 2141 /* 2142 * step 3: now that we're guaranteed success wrt the css_sets, 2143 * proceed to move all tasks to the new cgroup. There are no 2144 * failure cases after here, so this is the commit point. 2145 */ 2146 for (i = 0; i < group_size; i++) { 2147 tc = flex_array_get(group, i); 2148 cgroup_task_migrate(cgrp, tc->cgrp, tc->task, tc->cg); 2149 } 2150 /* nothing is sensitive to fork() after this point. */ 2151 2152 /* 2153 * step 4: do subsystem attach callbacks. 2154 */ 2155 for_each_subsys(root, ss) { 2156 if (ss->attach) 2157 ss->attach(cgrp, &tset); 2158 } 2159 2160 /* 2161 * step 5: success! and cleanup 2162 */ 2163 retval = 0; 2164out_put_css_set_refs: 2165 if (retval) { 2166 for (i = 0; i < group_size; i++) { 2167 tc = flex_array_get(group, i); 2168 if (!tc->cg) 2169 break; 2170 put_css_set(tc->cg); 2171 } 2172 } 2173out_cancel_attach: 2174 if (retval) { 2175 for_each_subsys(root, ss) { 2176 if (ss == failed_ss) 2177 break; 2178 if (ss->cancel_attach) 2179 ss->cancel_attach(cgrp, &tset); 2180 } 2181 } 2182out_free_group_list: 2183 flex_array_free(group); 2184 return retval; 2185} 2186 2187/* 2188 * Find the task_struct of the task to attach by vpid and pass it along to the 2189 * function to attach either it or all tasks in its threadgroup. Will lock 2190 * cgroup_mutex and threadgroup; may take task_lock of task. 2191 */ 2192static int attach_task_by_pid(struct cgroup *cgrp, u64 pid, bool threadgroup) 2193{ 2194 struct task_struct *tsk; 2195 const struct cred *cred = current_cred(), *tcred; 2196 int ret; 2197 2198 if (!cgroup_lock_live_group(cgrp)) 2199 return -ENODEV; 2200 2201retry_find_task: 2202 rcu_read_lock(); 2203 if (pid) { 2204 tsk = find_task_by_vpid(pid); 2205 if (!tsk) { 2206 rcu_read_unlock(); 2207 ret= -ESRCH; 2208 goto out_unlock_cgroup; 2209 } 2210 /* 2211 * even if we're attaching all tasks in the thread group, we 2212 * only need to check permissions on one of them. 2213 */ 2214 tcred = __task_cred(tsk); 2215 if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && 2216 !uid_eq(cred->euid, tcred->uid) && 2217 !uid_eq(cred->euid, tcred->suid)) { 2218 rcu_read_unlock(); 2219 ret = -EACCES; 2220 goto out_unlock_cgroup; 2221 } 2222 } else 2223 tsk = current; 2224 2225 if (threadgroup) 2226 tsk = tsk->group_leader; 2227 2228 /* 2229 * Workqueue threads may acquire PF_THREAD_BOUND and become 2230 * trapped in a cpuset, or RT worker may be born in a cgroup 2231 * with no rt_runtime allocated. Just say no. 2232 */ 2233 if (tsk == kthreadd_task || (tsk->flags & PF_THREAD_BOUND)) { 2234 ret = -EINVAL; 2235 rcu_read_unlock(); 2236 goto out_unlock_cgroup; 2237 } 2238 2239 get_task_struct(tsk); 2240 rcu_read_unlock(); 2241 2242 threadgroup_lock(tsk); 2243 if (threadgroup) { 2244 if (!thread_group_leader(tsk)) { 2245 /* 2246 * a race with de_thread from another thread's exec() 2247 * may strip us of our leadership, if this happens, 2248 * there is no choice but to throw this task away and 2249 * try again; this is 2250 * "double-double-toil-and-trouble-check locking". 2251 */ 2252 threadgroup_unlock(tsk); 2253 put_task_struct(tsk); 2254 goto retry_find_task; 2255 } 2256 ret = cgroup_attach_proc(cgrp, tsk); 2257 } else 2258 ret = cgroup_attach_task(cgrp, tsk); 2259 threadgroup_unlock(tsk); 2260 2261 put_task_struct(tsk); 2262out_unlock_cgroup: 2263 cgroup_unlock(); 2264 return ret; 2265} 2266 2267static int cgroup_tasks_write(struct cgroup *cgrp, struct cftype *cft, u64 pid) 2268{ 2269 return attach_task_by_pid(cgrp, pid, false); 2270} 2271 2272static int cgroup_procs_write(struct cgroup *cgrp, struct cftype *cft, u64 tgid) 2273{ 2274 return attach_task_by_pid(cgrp, tgid, true); 2275} 2276 2277/** 2278 * cgroup_lock_live_group - take cgroup_mutex and check that cgrp is alive. 2279 * @cgrp: the cgroup to be checked for liveness 2280 * 2281 * On success, returns true; the lock should be later released with 2282 * cgroup_unlock(). On failure returns false with no lock held. 2283 */ 2284bool cgroup_lock_live_group(struct cgroup *cgrp) 2285{ 2286 mutex_lock(&cgroup_mutex); 2287 if (cgroup_is_removed(cgrp)) { 2288 mutex_unlock(&cgroup_mutex); 2289 return false; 2290 } 2291 return true; 2292} 2293EXPORT_SYMBOL_GPL(cgroup_lock_live_group); 2294 2295static int cgroup_release_agent_write(struct cgroup *cgrp, struct cftype *cft, 2296 const char *buffer) 2297{ 2298 BUILD_BUG_ON(sizeof(cgrp->root->release_agent_path) < PATH_MAX); 2299 if (strlen(buffer) >= PATH_MAX) 2300 return -EINVAL; 2301 if (!cgroup_lock_live_group(cgrp)) 2302 return -ENODEV; 2303 mutex_lock(&cgroup_root_mutex); 2304 strcpy(cgrp->root->release_agent_path, buffer); 2305 mutex_unlock(&cgroup_root_mutex); 2306 cgroup_unlock(); 2307 return 0; 2308} 2309 2310static int cgroup_release_agent_show(struct cgroup *cgrp, struct cftype *cft, 2311 struct seq_file *seq) 2312{ 2313 if (!cgroup_lock_live_group(cgrp)) 2314 return -ENODEV; 2315 seq_puts(seq, cgrp->root->release_agent_path); 2316 seq_putc(seq, '\n'); 2317 cgroup_unlock(); 2318 return 0; 2319} 2320 2321/* A buffer size big enough for numbers or short strings */ 2322#define CGROUP_LOCAL_BUFFER_SIZE 64 2323 2324static ssize_t cgroup_write_X64(struct cgroup *cgrp, struct cftype *cft, 2325 struct file *file, 2326 const char __user *userbuf, 2327 size_t nbytes, loff_t *unused_ppos) 2328{ 2329 char buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2330 int retval = 0; 2331 char *end; 2332 2333 if (!nbytes) 2334 return -EINVAL; 2335 if (nbytes >= sizeof(buffer)) 2336 return -E2BIG; 2337 if (copy_from_user(buffer, userbuf, nbytes)) 2338 return -EFAULT; 2339 2340 buffer[nbytes] = 0; /* nul-terminate */ 2341 if (cft->write_u64) { 2342 u64 val = simple_strtoull(strstrip(buffer), &end, 0); 2343 if (*end) 2344 return -EINVAL; 2345 retval = cft->write_u64(cgrp, cft, val); 2346 } else { 2347 s64 val = simple_strtoll(strstrip(buffer), &end, 0); 2348 if (*end) 2349 return -EINVAL; 2350 retval = cft->write_s64(cgrp, cft, val); 2351 } 2352 if (!retval) 2353 retval = nbytes; 2354 return retval; 2355} 2356 2357static ssize_t cgroup_write_string(struct cgroup *cgrp, struct cftype *cft, 2358 struct file *file, 2359 const char __user *userbuf, 2360 size_t nbytes, loff_t *unused_ppos) 2361{ 2362 char local_buffer[CGROUP_LOCAL_BUFFER_SIZE]; 2363 int retval = 0; 2364 size_t max_bytes = cft->max_write_len; 2365 char *buffer = local_buffer; 2366 2367 if (!max_bytes) 2368 max_bytes = sizeof(local_buffer) - 1; 2369 if (nbytes >= max_bytes) 2370 return -E2BIG; 2371 /* Allocate a dynamic buffer if we need one */ 2372 if (nbytes >= sizeof(local_buffer)) { 2373 buffer = kmalloc(nbytes + 1, GFP_KERNEL); 2374 if (buffer == NULL) 2375 return -ENOMEM; 2376 } 2377 if (nbytes && copy_from_user(buffer, userbuf, nbytes)) { 2378 retval = -EFAULT; 2379 goto out; 2380 } 2381 2382 buffer[nbytes] = 0; /* nul-terminate */ 2383 retval = cft->write_string(cgrp, cft, strstrip(buffer)); 2384 if (!retval) 2385 retval = nbytes; 2386out: 2387 if (buffer != local_buffer) 2388 kfree(buffer); 2389 return retval; 2390} 2391 2392static ssize_t cgroup_file_write(struct file *file, const char __user *buf, 2393 size_t nbytes, loff_t *ppos) 2394{ 2395 struct cftype *cft = __d_cft(file->f_dentry); 2396 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2397 2398 if (cgroup_is_removed(cgrp)) 2399 return -ENODEV; 2400 if (cft->write) 2401 return cft->write(cgrp, cft, file, buf, nbytes, ppos); 2402 if (cft->write_u64 || cft->write_s64) 2403 return cgroup_write_X64(cgrp, cft, file, buf, nbytes, ppos); 2404 if (cft->write_string) 2405 return cgroup_write_string(cgrp, cft, file, buf, nbytes, ppos); 2406 if (cft->trigger) { 2407 int ret = cft->trigger(cgrp, (unsigned int)cft->private); 2408 return ret ? ret : nbytes; 2409 } 2410 return -EINVAL; 2411} 2412 2413static ssize_t cgroup_read_u64(struct cgroup *cgrp, struct cftype *cft, 2414 struct file *file, 2415 char __user *buf, size_t nbytes, 2416 loff_t *ppos) 2417{ 2418 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2419 u64 val = cft->read_u64(cgrp, cft); 2420 int len = sprintf(tmp, "%llu\n", (unsigned long long) val); 2421 2422 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2423} 2424 2425static ssize_t cgroup_read_s64(struct cgroup *cgrp, struct cftype *cft, 2426 struct file *file, 2427 char __user *buf, size_t nbytes, 2428 loff_t *ppos) 2429{ 2430 char tmp[CGROUP_LOCAL_BUFFER_SIZE]; 2431 s64 val = cft->read_s64(cgrp, cft); 2432 int len = sprintf(tmp, "%lld\n", (long long) val); 2433 2434 return simple_read_from_buffer(buf, nbytes, ppos, tmp, len); 2435} 2436 2437static ssize_t cgroup_file_read(struct file *file, char __user *buf, 2438 size_t nbytes, loff_t *ppos) 2439{ 2440 struct cftype *cft = __d_cft(file->f_dentry); 2441 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 2442 2443 if (cgroup_is_removed(cgrp)) 2444 return -ENODEV; 2445 2446 if (cft->read) 2447 return cft->read(cgrp, cft, file, buf, nbytes, ppos); 2448 if (cft->read_u64) 2449 return cgroup_read_u64(cgrp, cft, file, buf, nbytes, ppos); 2450 if (cft->read_s64) 2451 return cgroup_read_s64(cgrp, cft, file, buf, nbytes, ppos); 2452 return -EINVAL; 2453} 2454 2455/* 2456 * seqfile ops/methods for returning structured data. Currently just 2457 * supports string->u64 maps, but can be extended in future. 2458 */ 2459 2460struct cgroup_seqfile_state { 2461 struct cftype *cft; 2462 struct cgroup *cgroup; 2463}; 2464 2465static int cgroup_map_add(struct cgroup_map_cb *cb, const char *key, u64 value) 2466{ 2467 struct seq_file *sf = cb->state; 2468 return seq_printf(sf, "%s %llu\n", key, (unsigned long long)value); 2469} 2470 2471static int cgroup_seqfile_show(struct seq_file *m, void *arg) 2472{ 2473 struct cgroup_seqfile_state *state = m->private; 2474 struct cftype *cft = state->cft; 2475 if (cft->read_map) { 2476 struct cgroup_map_cb cb = { 2477 .fill = cgroup_map_add, 2478 .state = m, 2479 }; 2480 return cft->read_map(state->cgroup, cft, &cb); 2481 } 2482 return cft->read_seq_string(state->cgroup, cft, m); 2483} 2484 2485static int cgroup_seqfile_release(struct inode *inode, struct file *file) 2486{ 2487 struct seq_file *seq = file->private_data; 2488 kfree(seq->private); 2489 return single_release(inode, file); 2490} 2491 2492static const struct file_operations cgroup_seqfile_operations = { 2493 .read = seq_read, 2494 .write = cgroup_file_write, 2495 .llseek = seq_lseek, 2496 .release = cgroup_seqfile_release, 2497}; 2498 2499static int cgroup_file_open(struct inode *inode, struct file *file) 2500{ 2501 int err; 2502 struct cftype *cft; 2503 2504 err = generic_file_open(inode, file); 2505 if (err) 2506 return err; 2507 cft = __d_cft(file->f_dentry); 2508 2509 if (cft->read_map || cft->read_seq_string) { 2510 struct cgroup_seqfile_state *state = 2511 kzalloc(sizeof(*state), GFP_USER); 2512 if (!state) 2513 return -ENOMEM; 2514 state->cft = cft; 2515 state->cgroup = __d_cgrp(file->f_dentry->d_parent); 2516 file->f_op = &cgroup_seqfile_operations; 2517 err = single_open(file, cgroup_seqfile_show, state); 2518 if (err < 0) 2519 kfree(state); 2520 } else if (cft->open) 2521 err = cft->open(inode, file); 2522 else 2523 err = 0; 2524 2525 return err; 2526} 2527 2528static int cgroup_file_release(struct inode *inode, struct file *file) 2529{ 2530 struct cftype *cft = __d_cft(file->f_dentry); 2531 if (cft->release) 2532 return cft->release(inode, file); 2533 return 0; 2534} 2535 2536/* 2537 * cgroup_rename - Only allow simple rename of directories in place. 2538 */ 2539static int cgroup_rename(struct inode *old_dir, struct dentry *old_dentry, 2540 struct inode *new_dir, struct dentry *new_dentry) 2541{ 2542 if (!S_ISDIR(old_dentry->d_inode->i_mode)) 2543 return -ENOTDIR; 2544 if (new_dentry->d_inode) 2545 return -EEXIST; 2546 if (old_dir != new_dir) 2547 return -EIO; 2548 return simple_rename(old_dir, old_dentry, new_dir, new_dentry); 2549} 2550 2551static struct simple_xattrs *__d_xattrs(struct dentry *dentry) 2552{ 2553 if (S_ISDIR(dentry->d_inode->i_mode)) 2554 return &__d_cgrp(dentry)->xattrs; 2555 else 2556 return &__d_cfe(dentry)->xattrs; 2557} 2558 2559static inline int xattr_enabled(struct dentry *dentry) 2560{ 2561 struct cgroupfs_root *root = dentry->d_sb->s_fs_info; 2562 return test_bit(ROOT_XATTR, &root->flags); 2563} 2564 2565static bool is_valid_xattr(const char *name) 2566{ 2567 if (!strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN) || 2568 !strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN)) 2569 return true; 2570 return false; 2571} 2572 2573static int cgroup_setxattr(struct dentry *dentry, const char *name, 2574 const void *val, size_t size, int flags) 2575{ 2576 if (!xattr_enabled(dentry)) 2577 return -EOPNOTSUPP; 2578 if (!is_valid_xattr(name)) 2579 return -EINVAL; 2580 return simple_xattr_set(__d_xattrs(dentry), name, val, size, flags); 2581} 2582 2583static int cgroup_removexattr(struct dentry *dentry, const char *name) 2584{ 2585 if (!xattr_enabled(dentry)) 2586 return -EOPNOTSUPP; 2587 if (!is_valid_xattr(name)) 2588 return -EINVAL; 2589 return simple_xattr_remove(__d_xattrs(dentry), name); 2590} 2591 2592static ssize_t cgroup_getxattr(struct dentry *dentry, const char *name, 2593 void *buf, size_t size) 2594{ 2595 if (!xattr_enabled(dentry)) 2596 return -EOPNOTSUPP; 2597 if (!is_valid_xattr(name)) 2598 return -EINVAL; 2599 return simple_xattr_get(__d_xattrs(dentry), name, buf, size); 2600} 2601 2602static ssize_t cgroup_listxattr(struct dentry *dentry, char *buf, size_t size) 2603{ 2604 if (!xattr_enabled(dentry)) 2605 return -EOPNOTSUPP; 2606 return simple_xattr_list(__d_xattrs(dentry), buf, size); 2607} 2608 2609static const struct file_operations cgroup_file_operations = { 2610 .read = cgroup_file_read, 2611 .write = cgroup_file_write, 2612 .llseek = generic_file_llseek, 2613 .open = cgroup_file_open, 2614 .release = cgroup_file_release, 2615}; 2616 2617static const struct inode_operations cgroup_file_inode_operations = { 2618 .setxattr = cgroup_setxattr, 2619 .getxattr = cgroup_getxattr, 2620 .listxattr = cgroup_listxattr, 2621 .removexattr = cgroup_removexattr, 2622}; 2623 2624static const struct inode_operations cgroup_dir_inode_operations = { 2625 .lookup = cgroup_lookup, 2626 .mkdir = cgroup_mkdir, 2627 .rmdir = cgroup_rmdir, 2628 .rename = cgroup_rename, 2629 .setxattr = cgroup_setxattr, 2630 .getxattr = cgroup_getxattr, 2631 .listxattr = cgroup_listxattr, 2632 .removexattr = cgroup_removexattr, 2633}; 2634 2635static struct dentry *cgroup_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) 2636{ 2637 if (dentry->d_name.len > NAME_MAX) 2638 return ERR_PTR(-ENAMETOOLONG); 2639 d_add(dentry, NULL); 2640 return NULL; 2641} 2642 2643/* 2644 * Check if a file is a control file 2645 */ 2646static inline struct cftype *__file_cft(struct file *file) 2647{ 2648 if (file_inode(file)->i_fop != &cgroup_file_operations) 2649 return ERR_PTR(-EINVAL); 2650 return __d_cft(file->f_dentry); 2651} 2652 2653static int cgroup_create_file(struct dentry *dentry, umode_t mode, 2654 struct super_block *sb) 2655{ 2656 struct inode *inode; 2657 2658 if (!dentry) 2659 return -ENOENT; 2660 if (dentry->d_inode) 2661 return -EEXIST; 2662 2663 inode = cgroup_new_inode(mode, sb); 2664 if (!inode) 2665 return -ENOMEM; 2666 2667 if (S_ISDIR(mode)) { 2668 inode->i_op = &cgroup_dir_inode_operations; 2669 inode->i_fop = &simple_dir_operations; 2670 2671 /* start off with i_nlink == 2 (for "." entry) */ 2672 inc_nlink(inode); 2673 inc_nlink(dentry->d_parent->d_inode); 2674 2675 /* 2676 * Control reaches here with cgroup_mutex held. 2677 * @inode->i_mutex should nest outside cgroup_mutex but we 2678 * want to populate it immediately without releasing 2679 * cgroup_mutex. As @inode isn't visible to anyone else 2680 * yet, trylock will always succeed without affecting 2681 * lockdep checks. 2682 */ 2683 WARN_ON_ONCE(!mutex_trylock(&inode->i_mutex)); 2684 } else if (S_ISREG(mode)) { 2685 inode->i_size = 0; 2686 inode->i_fop = &cgroup_file_operations; 2687 inode->i_op = &cgroup_file_inode_operations; 2688 } 2689 d_instantiate(dentry, inode); 2690 dget(dentry); /* Extra count - pin the dentry in core */ 2691 return 0; 2692} 2693 2694/** 2695 * cgroup_file_mode - deduce file mode of a control file 2696 * @cft: the control file in question 2697 * 2698 * returns cft->mode if ->mode is not 0 2699 * returns S_IRUGO|S_IWUSR if it has both a read and a write handler 2700 * returns S_IRUGO if it has only a read handler 2701 * returns S_IWUSR if it has only a write hander 2702 */ 2703static umode_t cgroup_file_mode(const struct cftype *cft) 2704{ 2705 umode_t mode = 0; 2706 2707 if (cft->mode) 2708 return cft->mode; 2709 2710 if (cft->read || cft->read_u64 || cft->read_s64 || 2711 cft->read_map || cft->read_seq_string) 2712 mode |= S_IRUGO; 2713 2714 if (cft->write || cft->write_u64 || cft->write_s64 || 2715 cft->write_string || cft->trigger) 2716 mode |= S_IWUSR; 2717 2718 return mode; 2719} 2720 2721static int cgroup_add_file(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2722 struct cftype *cft) 2723{ 2724 struct dentry *dir = cgrp->dentry; 2725 struct cgroup *parent = __d_cgrp(dir); 2726 struct dentry *dentry; 2727 struct cfent *cfe; 2728 int error; 2729 umode_t mode; 2730 char name[MAX_CGROUP_TYPE_NAMELEN + MAX_CFTYPE_NAME + 2] = { 0 }; 2731 2732 if (subsys && !test_bit(ROOT_NOPREFIX, &cgrp->root->flags)) { 2733 strcpy(name, subsys->name); 2734 strcat(name, "."); 2735 } 2736 strcat(name, cft->name); 2737 2738 BUG_ON(!mutex_is_locked(&dir->d_inode->i_mutex)); 2739 2740 cfe = kzalloc(sizeof(*cfe), GFP_KERNEL); 2741 if (!cfe) 2742 return -ENOMEM; 2743 2744 dentry = lookup_one_len(name, dir, strlen(name)); 2745 if (IS_ERR(dentry)) { 2746 error = PTR_ERR(dentry); 2747 goto out; 2748 } 2749 2750 mode = cgroup_file_mode(cft); 2751 error = cgroup_create_file(dentry, mode | S_IFREG, cgrp->root->sb); 2752 if (!error) { 2753 cfe->type = (void *)cft; 2754 cfe->dentry = dentry; 2755 dentry->d_fsdata = cfe; 2756 simple_xattrs_init(&cfe->xattrs); 2757 list_add_tail(&cfe->node, &parent->files); 2758 cfe = NULL; 2759 } 2760 dput(dentry); 2761out: 2762 kfree(cfe); 2763 return error; 2764} 2765 2766static int cgroup_addrm_files(struct cgroup *cgrp, struct cgroup_subsys *subsys, 2767 struct cftype cfts[], bool is_add) 2768{ 2769 struct cftype *cft; 2770 int err, ret = 0; 2771 2772 for (cft = cfts; cft->name[0] != '\0'; cft++) { 2773 /* does cft->flags tell us to skip this file on @cgrp? */ 2774 if ((cft->flags & CFTYPE_NOT_ON_ROOT) && !cgrp->parent) 2775 continue; 2776 if ((cft->flags & CFTYPE_ONLY_ON_ROOT) && cgrp->parent) 2777 continue; 2778 2779 if (is_add) { 2780 err = cgroup_add_file(cgrp, subsys, cft); 2781 if (err) 2782 pr_warn("cgroup_addrm_files: failed to add %s, err=%d\n", 2783 cft->name, err); 2784 ret = err; 2785 } else { 2786 cgroup_rm_file(cgrp, cft); 2787 } 2788 } 2789 return ret; 2790} 2791 2792static DEFINE_MUTEX(cgroup_cft_mutex); 2793 2794static void cgroup_cfts_prepare(void) 2795 __acquires(&cgroup_cft_mutex) __acquires(&cgroup_mutex) 2796{ 2797 /* 2798 * Thanks to the entanglement with vfs inode locking, we can't walk 2799 * the existing cgroups under cgroup_mutex and create files. 2800 * Instead, we increment reference on all cgroups and build list of 2801 * them using @cgrp->cft_q_node. Grab cgroup_cft_mutex to ensure 2802 * exclusive access to the field. 2803 */ 2804 mutex_lock(&cgroup_cft_mutex); 2805 mutex_lock(&cgroup_mutex); 2806} 2807 2808static void cgroup_cfts_commit(struct cgroup_subsys *ss, 2809 struct cftype *cfts, bool is_add) 2810 __releases(&cgroup_mutex) __releases(&cgroup_cft_mutex) 2811{ 2812 LIST_HEAD(pending); 2813 struct cgroup *cgrp, *n; 2814 2815 /* %NULL @cfts indicates abort and don't bother if @ss isn't attached */ 2816 if (cfts && ss->root != &rootnode) { 2817 list_for_each_entry(cgrp, &ss->root->allcg_list, allcg_node) { 2818 dget(cgrp->dentry); 2819 list_add_tail(&cgrp->cft_q_node, &pending); 2820 } 2821 } 2822 2823 mutex_unlock(&cgroup_mutex); 2824 2825 /* 2826 * All new cgroups will see @cfts update on @ss->cftsets. Add/rm 2827 * files for all cgroups which were created before. 2828 */ 2829 list_for_each_entry_safe(cgrp, n, &pending, cft_q_node) { 2830 struct inode *inode = cgrp->dentry->d_inode; 2831 2832 mutex_lock(&inode->i_mutex); 2833 mutex_lock(&cgroup_mutex); 2834 if (!cgroup_is_removed(cgrp)) 2835 cgroup_addrm_files(cgrp, ss, cfts, is_add); 2836 mutex_unlock(&cgroup_mutex); 2837 mutex_unlock(&inode->i_mutex); 2838 2839 list_del_init(&cgrp->cft_q_node); 2840 dput(cgrp->dentry); 2841 } 2842 2843 mutex_unlock(&cgroup_cft_mutex); 2844} 2845 2846/** 2847 * cgroup_add_cftypes - add an array of cftypes to a subsystem 2848 * @ss: target cgroup subsystem 2849 * @cfts: zero-length name terminated array of cftypes 2850 * 2851 * Register @cfts to @ss. Files described by @cfts are created for all 2852 * existing cgroups to which @ss is attached and all future cgroups will 2853 * have them too. This function can be called anytime whether @ss is 2854 * attached or not. 2855 * 2856 * Returns 0 on successful registration, -errno on failure. Note that this 2857 * function currently returns 0 as long as @cfts registration is successful 2858 * even if some file creation attempts on existing cgroups fail. 2859 */ 2860int cgroup_add_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2861{ 2862 struct cftype_set *set; 2863 2864 set = kzalloc(sizeof(*set), GFP_KERNEL); 2865 if (!set) 2866 return -ENOMEM; 2867 2868 cgroup_cfts_prepare(); 2869 set->cfts = cfts; 2870 list_add_tail(&set->node, &ss->cftsets); 2871 cgroup_cfts_commit(ss, cfts, true); 2872 2873 return 0; 2874} 2875EXPORT_SYMBOL_GPL(cgroup_add_cftypes); 2876 2877/** 2878 * cgroup_rm_cftypes - remove an array of cftypes from a subsystem 2879 * @ss: target cgroup subsystem 2880 * @cfts: zero-length name terminated array of cftypes 2881 * 2882 * Unregister @cfts from @ss. Files described by @cfts are removed from 2883 * all existing cgroups to which @ss is attached and all future cgroups 2884 * won't have them either. This function can be called anytime whether @ss 2885 * is attached or not. 2886 * 2887 * Returns 0 on successful unregistration, -ENOENT if @cfts is not 2888 * registered with @ss. 2889 */ 2890int cgroup_rm_cftypes(struct cgroup_subsys *ss, struct cftype *cfts) 2891{ 2892 struct cftype_set *set; 2893 2894 cgroup_cfts_prepare(); 2895 2896 list_for_each_entry(set, &ss->cftsets, node) { 2897 if (set->cfts == cfts) { 2898 list_del_init(&set->node); 2899 cgroup_cfts_commit(ss, cfts, false); 2900 return 0; 2901 } 2902 } 2903 2904 cgroup_cfts_commit(ss, NULL, false); 2905 return -ENOENT; 2906} 2907 2908/** 2909 * cgroup_task_count - count the number of tasks in a cgroup. 2910 * @cgrp: the cgroup in question 2911 * 2912 * Return the number of tasks in the cgroup. 2913 */ 2914int cgroup_task_count(const struct cgroup *cgrp) 2915{ 2916 int count = 0; 2917 struct cg_cgroup_link *link; 2918 2919 read_lock(&css_set_lock); 2920 list_for_each_entry(link, &cgrp->css_sets, cgrp_link_list) { 2921 count += atomic_read(&link->cg->refcount); 2922 } 2923 read_unlock(&css_set_lock); 2924 return count; 2925} 2926 2927/* 2928 * Advance a list_head iterator. The iterator should be positioned at 2929 * the start of a css_set 2930 */ 2931static void cgroup_advance_iter(struct cgroup *cgrp, 2932 struct cgroup_iter *it) 2933{ 2934 struct list_head *l = it->cg_link; 2935 struct cg_cgroup_link *link; 2936 struct css_set *cg; 2937 2938 /* Advance to the next non-empty css_set */ 2939 do { 2940 l = l->next; 2941 if (l == &cgrp->css_sets) { 2942 it->cg_link = NULL; 2943 return; 2944 } 2945 link = list_entry(l, struct cg_cgroup_link, cgrp_link_list); 2946 cg = link->cg; 2947 } while (list_empty(&cg->tasks)); 2948 it->cg_link = l; 2949 it->task = cg->tasks.next; 2950} 2951 2952/* 2953 * To reduce the fork() overhead for systems that are not actually 2954 * using their cgroups capability, we don't maintain the lists running 2955 * through each css_set to its tasks until we see the list actually 2956 * used - in other words after the first call to cgroup_iter_start(). 2957 */ 2958static void cgroup_enable_task_cg_lists(void) 2959{ 2960 struct task_struct *p, *g; 2961 write_lock(&css_set_lock); 2962 use_task_css_set_links = 1; 2963 /* 2964 * We need tasklist_lock because RCU is not safe against 2965 * while_each_thread(). Besides, a forking task that has passed 2966 * cgroup_post_fork() without seeing use_task_css_set_links = 1 2967 * is not guaranteed to have its child immediately visible in the 2968 * tasklist if we walk through it with RCU. 2969 */ 2970 read_lock(&tasklist_lock); 2971 do_each_thread(g, p) { 2972 task_lock(p); 2973 /* 2974 * We should check if the process is exiting, otherwise 2975 * it will race with cgroup_exit() in that the list 2976 * entry won't be deleted though the process has exited. 2977 */ 2978 if (!(p->flags & PF_EXITING) && list_empty(&p->cg_list)) 2979 list_add(&p->cg_list, &p->cgroups->tasks); 2980 task_unlock(p); 2981 } while_each_thread(g, p); 2982 read_unlock(&tasklist_lock); 2983 write_unlock(&css_set_lock); 2984} 2985 2986/** 2987 * cgroup_next_descendant_pre - find the next descendant for pre-order walk 2988 * @pos: the current position (%NULL to initiate traversal) 2989 * @cgroup: cgroup whose descendants to walk 2990 * 2991 * To be used by cgroup_for_each_descendant_pre(). Find the next 2992 * descendant to visit for pre-order traversal of @cgroup's descendants. 2993 */ 2994struct cgroup *cgroup_next_descendant_pre(struct cgroup *pos, 2995 struct cgroup *cgroup) 2996{ 2997 struct cgroup *next; 2998 2999 WARN_ON_ONCE(!rcu_read_lock_held()); 3000 3001 /* if first iteration, pretend we just visited @cgroup */ 3002 if (!pos) { 3003 if (list_empty(&cgroup->children)) 3004 return NULL; 3005 pos = cgroup; 3006 } 3007 3008 /* visit the first child if exists */ 3009 next = list_first_or_null_rcu(&pos->children, struct cgroup, sibling); 3010 if (next) 3011 return next; 3012 3013 /* no child, visit my or the closest ancestor's next sibling */ 3014 do { 3015 next = list_entry_rcu(pos->sibling.next, struct cgroup, 3016 sibling); 3017 if (&next->sibling != &pos->parent->children) 3018 return next; 3019 3020 pos = pos->parent; 3021 } while (pos != cgroup); 3022 3023 return NULL; 3024} 3025EXPORT_SYMBOL_GPL(cgroup_next_descendant_pre); 3026 3027/** 3028 * cgroup_rightmost_descendant - return the rightmost descendant of a cgroup 3029 * @pos: cgroup of interest 3030 * 3031 * Return the rightmost descendant of @pos. If there's no descendant, 3032 * @pos is returned. This can be used during pre-order traversal to skip 3033 * subtree of @pos. 3034 */ 3035struct cgroup *cgroup_rightmost_descendant(struct cgroup *pos) 3036{ 3037 struct cgroup *last, *tmp; 3038 3039 WARN_ON_ONCE(!rcu_read_lock_held()); 3040 3041 do { 3042 last = pos; 3043 /* ->prev isn't RCU safe, walk ->next till the end */ 3044 pos = NULL; 3045 list_for_each_entry_rcu(tmp, &last->children, sibling) 3046 pos = tmp; 3047 } while (pos); 3048 3049 return last; 3050} 3051EXPORT_SYMBOL_GPL(cgroup_rightmost_descendant); 3052 3053static struct cgroup *cgroup_leftmost_descendant(struct cgroup *pos) 3054{ 3055 struct cgroup *last; 3056 3057 do { 3058 last = pos; 3059 pos = list_first_or_null_rcu(&pos->children, struct cgroup, 3060 sibling); 3061 } while (pos); 3062 3063 return last; 3064} 3065 3066/** 3067 * cgroup_next_descendant_post - find the next descendant for post-order walk 3068 * @pos: the current position (%NULL to initiate traversal) 3069 * @cgroup: cgroup whose descendants to walk 3070 * 3071 * To be used by cgroup_for_each_descendant_post(). Find the next 3072 * descendant to visit for post-order traversal of @cgroup's descendants. 3073 */ 3074struct cgroup *cgroup_next_descendant_post(struct cgroup *pos, 3075 struct cgroup *cgroup) 3076{ 3077 struct cgroup *next; 3078 3079 WARN_ON_ONCE(!rcu_read_lock_held()); 3080 3081 /* if first iteration, visit the leftmost descendant */ 3082 if (!pos) { 3083 next = cgroup_leftmost_descendant(cgroup); 3084 return next != cgroup ? next : NULL; 3085 } 3086 3087 /* if there's an unvisited sibling, visit its leftmost descendant */ 3088 next = list_entry_rcu(pos->sibling.next, struct cgroup, sibling); 3089 if (&next->sibling != &pos->parent->children) 3090 return cgroup_leftmost_descendant(next); 3091 3092 /* no sibling left, visit parent */ 3093 next = pos->parent; 3094 return next != cgroup ? next : NULL; 3095} 3096EXPORT_SYMBOL_GPL(cgroup_next_descendant_post); 3097 3098void cgroup_iter_start(struct cgroup *cgrp, struct cgroup_iter *it) 3099 __acquires(css_set_lock) 3100{ 3101 /* 3102 * The first time anyone tries to iterate across a cgroup, 3103 * we need to enable the list linking each css_set to its 3104 * tasks, and fix up all existing tasks. 3105 */ 3106 if (!use_task_css_set_links) 3107 cgroup_enable_task_cg_lists(); 3108 3109 read_lock(&css_set_lock); 3110 it->cg_link = &cgrp->css_sets; 3111 cgroup_advance_iter(cgrp, it); 3112} 3113 3114struct task_struct *cgroup_iter_next(struct cgroup *cgrp, 3115 struct cgroup_iter *it) 3116{ 3117 struct task_struct *res; 3118 struct list_head *l = it->task; 3119 struct cg_cgroup_link *link; 3120 3121 /* If the iterator cg is NULL, we have no tasks */ 3122 if (!it->cg_link) 3123 return NULL; 3124 res = list_entry(l, struct task_struct, cg_list); 3125 /* Advance iterator to find next entry */ 3126 l = l->next; 3127 link = list_entry(it->cg_link, struct cg_cgroup_link, cgrp_link_list); 3128 if (l == &link->cg->tasks) { 3129 /* We reached the end of this task list - move on to 3130 * the next cg_cgroup_link */ 3131 cgroup_advance_iter(cgrp, it); 3132 } else { 3133 it->task = l; 3134 } 3135 return res; 3136} 3137 3138void cgroup_iter_end(struct cgroup *cgrp, struct cgroup_iter *it) 3139 __releases(css_set_lock) 3140{ 3141 read_unlock(&css_set_lock); 3142} 3143 3144static inline int started_after_time(struct task_struct *t1, 3145 struct timespec *time, 3146 struct task_struct *t2) 3147{ 3148 int start_diff = timespec_compare(&t1->start_time, time); 3149 if (start_diff > 0) { 3150 return 1; 3151 } else if (start_diff < 0) { 3152 return 0; 3153 } else { 3154 /* 3155 * Arbitrarily, if two processes started at the same 3156 * time, we'll say that the lower pointer value 3157 * started first. Note that t2 may have exited by now 3158 * so this may not be a valid pointer any longer, but 3159 * that's fine - it still serves to distinguish 3160 * between two tasks started (effectively) simultaneously. 3161 */ 3162 return t1 > t2; 3163 } 3164} 3165 3166/* 3167 * This function is a callback from heap_insert() and is used to order 3168 * the heap. 3169 * In this case we order the heap in descending task start time. 3170 */ 3171static inline int started_after(void *p1, void *p2) 3172{ 3173 struct task_struct *t1 = p1; 3174 struct task_struct *t2 = p2; 3175 return started_after_time(t1, &t2->start_time, t2); 3176} 3177 3178/** 3179 * cgroup_scan_tasks - iterate though all the tasks in a cgroup 3180 * @scan: struct cgroup_scanner containing arguments for the scan 3181 * 3182 * Arguments include pointers to callback functions test_task() and 3183 * process_task(). 3184 * Iterate through all the tasks in a cgroup, calling test_task() for each, 3185 * and if it returns true, call process_task() for it also. 3186 * The test_task pointer may be NULL, meaning always true (select all tasks). 3187 * Effectively duplicates cgroup_iter_{start,next,end}() 3188 * but does not lock css_set_lock for the call to process_task(). 3189 * The struct cgroup_scanner may be embedded in any structure of the caller's 3190 * creation. 3191 * It is guaranteed that process_task() will act on every task that 3192 * is a member of the cgroup for the duration of this call. This 3193 * function may or may not call process_task() for tasks that exit 3194 * or move to a different cgroup during the call, or are forked or 3195 * move into the cgroup during the call. 3196 * 3197 * Note that test_task() may be called with locks held, and may in some 3198 * situations be called multiple times for the same task, so it should 3199 * be cheap. 3200 * If the heap pointer in the struct cgroup_scanner is non-NULL, a heap has been 3201 * pre-allocated and will be used for heap operations (and its "gt" member will 3202 * be overwritten), else a temporary heap will be used (allocation of which 3203 * may cause this function to fail). 3204 */ 3205int cgroup_scan_tasks(struct cgroup_scanner *scan) 3206{ 3207 int retval, i; 3208 struct cgroup_iter it; 3209 struct task_struct *p, *dropped; 3210 /* Never dereference latest_task, since it's not refcounted */ 3211 struct task_struct *latest_task = NULL; 3212 struct ptr_heap tmp_heap; 3213 struct ptr_heap *heap; 3214 struct timespec latest_time = { 0, 0 }; 3215 3216 if (scan->heap) { 3217 /* The caller supplied our heap and pre-allocated its memory */ 3218 heap = scan->heap; 3219 heap->gt = &started_after; 3220 } else { 3221 /* We need to allocate our own heap memory */ 3222 heap = &tmp_heap; 3223 retval = heap_init(heap, PAGE_SIZE, GFP_KERNEL, &started_after); 3224 if (retval) 3225 /* cannot allocate the heap */ 3226 return retval; 3227 } 3228 3229 again: 3230 /* 3231 * Scan tasks in the cgroup, using the scanner's "test_task" callback 3232 * to determine which are of interest, and using the scanner's 3233 * "process_task" callback to process any of them that need an update. 3234 * Since we don't want to hold any locks during the task updates, 3235 * gather tasks to be processed in a heap structure. 3236 * The heap is sorted by descending task start time. 3237 * If the statically-sized heap fills up, we overflow tasks that 3238 * started later, and in future iterations only consider tasks that 3239 * started after the latest task in the previous pass. This 3240 * guarantees forward progress and that we don't miss any tasks. 3241 */ 3242 heap->size = 0; 3243 cgroup_iter_start(scan->cg, &it); 3244 while ((p = cgroup_iter_next(scan->cg, &it))) { 3245 /* 3246 * Only affect tasks that qualify per the caller's callback, 3247 * if he provided one 3248 */ 3249 if (scan->test_task && !scan->test_task(p, scan)) 3250 continue; 3251 /* 3252 * Only process tasks that started after the last task 3253 * we processed 3254 */ 3255 if (!started_after_time(p, &latest_time, latest_task)) 3256 continue; 3257 dropped = heap_insert(heap, p); 3258 if (dropped == NULL) { 3259 /* 3260 * The new task was inserted; the heap wasn't 3261 * previously full 3262 */ 3263 get_task_struct(p); 3264 } else if (dropped != p) { 3265 /* 3266 * The new task was inserted, and pushed out a 3267 * different task 3268 */ 3269 get_task_struct(p); 3270 put_task_struct(dropped); 3271 } 3272 /* 3273 * Else the new task was newer than anything already in 3274 * the heap and wasn't inserted 3275 */ 3276 } 3277 cgroup_iter_end(scan->cg, &it); 3278 3279 if (heap->size) { 3280 for (i = 0; i < heap->size; i++) { 3281 struct task_struct *q = heap->ptrs[i]; 3282 if (i == 0) { 3283 latest_time = q->start_time; 3284 latest_task = q; 3285 } 3286 /* Process the task per the caller's callback */ 3287 scan->process_task(q, scan); 3288 put_task_struct(q); 3289 } 3290 /* 3291 * If we had to process any tasks at all, scan again 3292 * in case some of them were in the middle of forking 3293 * children that didn't get processed. 3294 * Not the most efficient way to do it, but it avoids 3295 * having to take callback_mutex in the fork path 3296 */ 3297 goto again; 3298 } 3299 if (heap == &tmp_heap) 3300 heap_free(&tmp_heap); 3301 return 0; 3302} 3303 3304/* 3305 * Stuff for reading the 'tasks'/'procs' files. 3306 * 3307 * Reading this file can return large amounts of data if a cgroup has 3308 * *lots* of attached tasks. So it may need several calls to read(), 3309 * but we cannot guarantee that the information we produce is correct 3310 * unless we produce it entirely atomically. 3311 * 3312 */ 3313 3314/* which pidlist file are we talking about? */ 3315enum cgroup_filetype { 3316 CGROUP_FILE_PROCS, 3317 CGROUP_FILE_TASKS, 3318}; 3319 3320/* 3321 * A pidlist is a list of pids that virtually represents the contents of one 3322 * of the cgroup files ("procs" or "tasks"). We keep a list of such pidlists, 3323 * a pair (one each for procs, tasks) for each pid namespace that's relevant 3324 * to the cgroup. 3325 */ 3326struct cgroup_pidlist { 3327 /* 3328 * used to find which pidlist is wanted. doesn't change as long as 3329 * this particular list stays in the list. 3330 */ 3331 struct { enum cgroup_filetype type; struct pid_namespace *ns; } key; 3332 /* array of xids */ 3333 pid_t *list; 3334 /* how many elements the above list has */ 3335 int length; 3336 /* how many files are using the current array */ 3337 int use_count; 3338 /* each of these stored in a list by its cgroup */ 3339 struct list_head links; 3340 /* pointer to the cgroup we belong to, for list removal purposes */ 3341 struct cgroup *owner; 3342 /* protects the other fields */ 3343 struct rw_semaphore mutex; 3344}; 3345 3346/* 3347 * The following two functions "fix" the issue where there are more pids 3348 * than kmalloc will give memory for; in such cases, we use vmalloc/vfree. 3349 * TODO: replace with a kernel-wide solution to this problem 3350 */ 3351#define PIDLIST_TOO_LARGE(c) ((c) * sizeof(pid_t) > (PAGE_SIZE * 2)) 3352static void *pidlist_allocate(int count) 3353{ 3354 if (PIDLIST_TOO_LARGE(count)) 3355 return vmalloc(count * sizeof(pid_t)); 3356 else 3357 return kmalloc(count * sizeof(pid_t), GFP_KERNEL); 3358} 3359static void pidlist_free(void *p) 3360{ 3361 if (is_vmalloc_addr(p)) 3362 vfree(p); 3363 else 3364 kfree(p); 3365} 3366static void *pidlist_resize(void *p, int newcount) 3367{ 3368 void *newlist; 3369 /* note: if new alloc fails, old p will still be valid either way */ 3370 if (is_vmalloc_addr(p)) { 3371 newlist = vmalloc(newcount * sizeof(pid_t)); 3372 if (!newlist) 3373 return NULL; 3374 memcpy(newlist, p, newcount * sizeof(pid_t)); 3375 vfree(p); 3376 } else { 3377 newlist = krealloc(p, newcount * sizeof(pid_t), GFP_KERNEL); 3378 } 3379 return newlist; 3380} 3381 3382/* 3383 * pidlist_uniq - given a kmalloc()ed list, strip out all duplicate entries 3384 * If the new stripped list is sufficiently smaller and there's enough memory 3385 * to allocate a new buffer, will let go of the unneeded memory. Returns the 3386 * number of unique elements. 3387 */ 3388/* is the size difference enough that we should re-allocate the array? */ 3389#define PIDLIST_REALLOC_DIFFERENCE(old, new) ((old) - PAGE_SIZE >= (new)) 3390static int pidlist_uniq(pid_t **p, int length) 3391{ 3392 int src, dest = 1; 3393 pid_t *list = *p; 3394 pid_t *newlist; 3395 3396 /* 3397 * we presume the 0th element is unique, so i starts at 1. trivial 3398 * edge cases first; no work needs to be done for either 3399 */ 3400 if (length == 0 || length == 1) 3401 return length; 3402 /* src and dest walk down the list; dest counts unique elements */ 3403 for (src = 1; src < length; src++) { 3404 /* find next unique element */ 3405 while (list[src] == list[src-1]) { 3406 src++; 3407 if (src == length) 3408 goto after; 3409 } 3410 /* dest always points to where the next unique element goes */ 3411 list[dest] = list[src]; 3412 dest++; 3413 } 3414after: 3415 /* 3416 * if the length difference is large enough, we want to allocate a 3417 * smaller buffer to save memory. if this fails due to out of memory, 3418 * we'll just stay with what we've got. 3419 */ 3420 if (PIDLIST_REALLOC_DIFFERENCE(length, dest)) { 3421 newlist = pidlist_resize(list, dest); 3422 if (newlist) 3423 *p = newlist; 3424 } 3425 return dest; 3426} 3427 3428static int cmppid(const void *a, const void *b) 3429{ 3430 return *(pid_t *)a - *(pid_t *)b; 3431} 3432 3433/* 3434 * find the appropriate pidlist for our purpose (given procs vs tasks) 3435 * returns with the lock on that pidlist already held, and takes care 3436 * of the use count, or returns NULL with no locks held if we're out of 3437 * memory. 3438 */ 3439static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, 3440 enum cgroup_filetype type) 3441{ 3442 struct cgroup_pidlist *l; 3443 /* don't need task_nsproxy() if we're looking at ourself */ 3444 struct pid_namespace *ns = task_active_pid_ns(current); 3445 3446 /* 3447 * We can't drop the pidlist_mutex before taking the l->mutex in case 3448 * the last ref-holder is trying to remove l from the list at the same 3449 * time. Holding the pidlist_mutex precludes somebody taking whichever 3450 * list we find out from under us - compare release_pid_array(). 3451 */ 3452 mutex_lock(&cgrp->pidlist_mutex); 3453 list_for_each_entry(l, &cgrp->pidlists, links) { 3454 if (l->key.type == type && l->key.ns == ns) { 3455 /* make sure l doesn't vanish out from under us */ 3456 down_write(&l->mutex); 3457 mutex_unlock(&cgrp->pidlist_mutex); 3458 return l; 3459 } 3460 } 3461 /* entry not found; create a new one */ 3462 l = kmalloc(sizeof(struct cgroup_pidlist), GFP_KERNEL); 3463 if (!l) { 3464 mutex_unlock(&cgrp->pidlist_mutex); 3465 return l; 3466 } 3467 init_rwsem(&l->mutex); 3468 down_write(&l->mutex); 3469 l->key.type = type; 3470 l->key.ns = get_pid_ns(ns); 3471 l->use_count = 0; /* don't increment here */ 3472 l->list = NULL; 3473 l->owner = cgrp; 3474 list_add(&l->links, &cgrp->pidlists); 3475 mutex_unlock(&cgrp->pidlist_mutex); 3476 return l; 3477} 3478 3479/* 3480 * Load a cgroup's pidarray with either procs' tgids or tasks' pids 3481 */ 3482static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, 3483 struct cgroup_pidlist **lp) 3484{ 3485 pid_t *array; 3486 int length; 3487 int pid, n = 0; /* used for populating the array */ 3488 struct cgroup_iter it; 3489 struct task_struct *tsk; 3490 struct cgroup_pidlist *l; 3491 3492 /* 3493 * If cgroup gets more users after we read count, we won't have 3494 * enough space - tough. This race is indistinguishable to the 3495 * caller from the case that the additional cgroup users didn't 3496 * show up until sometime later on. 3497 */ 3498 length = cgroup_task_count(cgrp); 3499 array = pidlist_allocate(length); 3500 if (!array) 3501 return -ENOMEM; 3502 /* now, populate the array */ 3503 cgroup_iter_start(cgrp, &it); 3504 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3505 if (unlikely(n == length)) 3506 break; 3507 /* get tgid or pid for procs or tasks file respectively */ 3508 if (type == CGROUP_FILE_PROCS) 3509 pid = task_tgid_vnr(tsk); 3510 else 3511 pid = task_pid_vnr(tsk); 3512 if (pid > 0) /* make sure to only use valid results */ 3513 array[n++] = pid; 3514 } 3515 cgroup_iter_end(cgrp, &it); 3516 length = n; 3517 /* now sort & (if procs) strip out duplicates */ 3518 sort(array, length, sizeof(pid_t), cmppid, NULL); 3519 if (type == CGROUP_FILE_PROCS) 3520 length = pidlist_uniq(&array, length); 3521 l = cgroup_pidlist_find(cgrp, type); 3522 if (!l) { 3523 pidlist_free(array); 3524 return -ENOMEM; 3525 } 3526 /* store array, freeing old if necessary - lock already held */ 3527 pidlist_free(l->list); 3528 l->list = array; 3529 l->length = length; 3530 l->use_count++; 3531 up_write(&l->mutex); 3532 *lp = l; 3533 return 0; 3534} 3535 3536/** 3537 * cgroupstats_build - build and fill cgroupstats 3538 * @stats: cgroupstats to fill information into 3539 * @dentry: A dentry entry belonging to the cgroup for which stats have 3540 * been requested. 3541 * 3542 * Build and fill cgroupstats so that taskstats can export it to user 3543 * space. 3544 */ 3545int cgroupstats_build(struct cgroupstats *stats, struct dentry *dentry) 3546{ 3547 int ret = -EINVAL; 3548 struct cgroup *cgrp; 3549 struct cgroup_iter it; 3550 struct task_struct *tsk; 3551 3552 /* 3553 * Validate dentry by checking the superblock operations, 3554 * and make sure it's a directory. 3555 */ 3556 if (dentry->d_sb->s_op != &cgroup_ops || 3557 !S_ISDIR(dentry->d_inode->i_mode)) 3558 goto err; 3559 3560 ret = 0; 3561 cgrp = dentry->d_fsdata; 3562 3563 cgroup_iter_start(cgrp, &it); 3564 while ((tsk = cgroup_iter_next(cgrp, &it))) { 3565 switch (tsk->state) { 3566 case TASK_RUNNING: 3567 stats->nr_running++; 3568 break; 3569 case TASK_INTERRUPTIBLE: 3570 stats->nr_sleeping++; 3571 break; 3572 case TASK_UNINTERRUPTIBLE: 3573 stats->nr_uninterruptible++; 3574 break; 3575 case TASK_STOPPED: 3576 stats->nr_stopped++; 3577 break; 3578 default: 3579 if (delayacct_is_task_waiting_on_io(tsk)) 3580 stats->nr_io_wait++; 3581 break; 3582 } 3583 } 3584 cgroup_iter_end(cgrp, &it); 3585 3586err: 3587 return ret; 3588} 3589 3590 3591/* 3592 * seq_file methods for the tasks/procs files. The seq_file position is the 3593 * next pid to display; the seq_file iterator is a pointer to the pid 3594 * in the cgroup->l->list array. 3595 */ 3596 3597static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) 3598{ 3599 /* 3600 * Initially we receive a position value that corresponds to 3601 * one more than the last pid shown (or 0 on the first call or 3602 * after a seek to the start). Use a binary-search to find the 3603 * next pid to display, if any 3604 */ 3605 struct cgroup_pidlist *l = s->private; 3606 int index = 0, pid = *pos; 3607 int *iter; 3608 3609 down_read(&l->mutex); 3610 if (pid) { 3611 int end = l->length; 3612 3613 while (index < end) { 3614 int mid = (index + end) / 2; 3615 if (l->list[mid] == pid) { 3616 index = mid; 3617 break; 3618 } else if (l->list[mid] <= pid) 3619 index = mid + 1; 3620 else 3621 end = mid; 3622 } 3623 } 3624 /* If we're off the end of the array, we're done */ 3625 if (index >= l->length) 3626 return NULL; 3627 /* Update the abstract position to be the actual pid that we found */ 3628 iter = l->list + index; 3629 *pos = *iter; 3630 return iter; 3631} 3632 3633static void cgroup_pidlist_stop(struct seq_file *s, void *v) 3634{ 3635 struct cgroup_pidlist *l = s->private; 3636 up_read(&l->mutex); 3637} 3638 3639static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) 3640{ 3641 struct cgroup_pidlist *l = s->private; 3642 pid_t *p = v; 3643 pid_t *end = l->list + l->length; 3644 /* 3645 * Advance to the next pid in the array. If this goes off the 3646 * end, we're done 3647 */ 3648 p++; 3649 if (p >= end) { 3650 return NULL; 3651 } else { 3652 *pos = *p; 3653 return p; 3654 } 3655} 3656 3657static int cgroup_pidlist_show(struct seq_file *s, void *v) 3658{ 3659 return seq_printf(s, "%d\n", *(int *)v); 3660} 3661 3662/* 3663 * seq_operations functions for iterating on pidlists through seq_file - 3664 * independent of whether it's tasks or procs 3665 */ 3666static const struct seq_operations cgroup_pidlist_seq_operations = { 3667 .start = cgroup_pidlist_start, 3668 .stop = cgroup_pidlist_stop, 3669 .next = cgroup_pidlist_next, 3670 .show = cgroup_pidlist_show, 3671}; 3672 3673static void cgroup_release_pid_array(struct cgroup_pidlist *l) 3674{ 3675 /* 3676 * the case where we're the last user of this particular pidlist will 3677 * have us remove it from the cgroup's list, which entails taking the 3678 * mutex. since in pidlist_find the pidlist->lock depends on cgroup-> 3679 * pidlist_mutex, we have to take pidlist_mutex first. 3680 */ 3681 mutex_lock(&l->owner->pidlist_mutex); 3682 down_write(&l->mutex); 3683 BUG_ON(!l->use_count); 3684 if (!--l->use_count) { 3685 /* we're the last user if refcount is 0; remove and free */ 3686 list_del(&l->links); 3687 mutex_unlock(&l->owner->pidlist_mutex); 3688 pidlist_free(l->list); 3689 put_pid_ns(l->key.ns); 3690 up_write(&l->mutex); 3691 kfree(l); 3692 return; 3693 } 3694 mutex_unlock(&l->owner->pidlist_mutex); 3695 up_write(&l->mutex); 3696} 3697 3698static int cgroup_pidlist_release(struct inode *inode, struct file *file) 3699{ 3700 struct cgroup_pidlist *l; 3701 if (!(file->f_mode & FMODE_READ)) 3702 return 0; 3703 /* 3704 * the seq_file will only be initialized if the file was opened for 3705 * reading; hence we check if it's not null only in that case. 3706 */ 3707 l = ((struct seq_file *)file->private_data)->private; 3708 cgroup_release_pid_array(l); 3709 return seq_release(inode, file); 3710} 3711 3712static const struct file_operations cgroup_pidlist_operations = { 3713 .read = seq_read, 3714 .llseek = seq_lseek, 3715 .write = cgroup_file_write, 3716 .release = cgroup_pidlist_release, 3717}; 3718 3719/* 3720 * The following functions handle opens on a file that displays a pidlist 3721 * (tasks or procs). Prepare an array of the process/thread IDs of whoever's 3722 * in the cgroup. 3723 */ 3724/* helper function for the two below it */ 3725static int cgroup_pidlist_open(struct file *file, enum cgroup_filetype type) 3726{ 3727 struct cgroup *cgrp = __d_cgrp(file->f_dentry->d_parent); 3728 struct cgroup_pidlist *l; 3729 int retval; 3730 3731 /* Nothing to do for write-only files */ 3732 if (!(file->f_mode & FMODE_READ)) 3733 return 0; 3734 3735 /* have the array populated */ 3736 retval = pidlist_array_load(cgrp, type, &l); 3737 if (retval) 3738 return retval; 3739 /* configure file information */ 3740 file->f_op = &cgroup_pidlist_operations; 3741 3742 retval = seq_open(file, &cgroup_pidlist_seq_operations); 3743 if (retval) { 3744 cgroup_release_pid_array(l); 3745 return retval; 3746 } 3747 ((struct seq_file *)file->private_data)->private = l; 3748 return 0; 3749} 3750static int cgroup_tasks_open(struct inode *unused, struct file *file) 3751{ 3752 return cgroup_pidlist_open(file, CGROUP_FILE_TASKS); 3753} 3754static int cgroup_procs_open(struct inode *unused, struct file *file) 3755{ 3756 return cgroup_pidlist_open(file, CGROUP_FILE_PROCS); 3757} 3758 3759static u64 cgroup_read_notify_on_release(struct cgroup *cgrp, 3760 struct cftype *cft) 3761{ 3762 return notify_on_release(cgrp); 3763} 3764 3765static int cgroup_write_notify_on_release(struct cgroup *cgrp, 3766 struct cftype *cft, 3767 u64 val) 3768{ 3769 clear_bit(CGRP_RELEASABLE, &cgrp->flags); 3770 if (val) 3771 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3772 else 3773 clear_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 3774 return 0; 3775} 3776 3777/* 3778 * Unregister event and free resources. 3779 * 3780 * Gets called from workqueue. 3781 */ 3782static void cgroup_event_remove(struct work_struct *work) 3783{ 3784 struct cgroup_event *event = container_of(work, struct cgroup_event, 3785 remove); 3786 struct cgroup *cgrp = event->cgrp; 3787 3788 remove_wait_queue(event->wqh, &event->wait); 3789 3790 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3791 3792 /* Notify userspace the event is going away. */ 3793 eventfd_signal(event->eventfd, 1); 3794 3795 eventfd_ctx_put(event->eventfd); 3796 kfree(event); 3797 dput(cgrp->dentry); 3798} 3799 3800/* 3801 * Gets called on POLLHUP on eventfd when user closes it. 3802 * 3803 * Called with wqh->lock held and interrupts disabled. 3804 */ 3805static int cgroup_event_wake(wait_queue_t *wait, unsigned mode, 3806 int sync, void *key) 3807{ 3808 struct cgroup_event *event = container_of(wait, 3809 struct cgroup_event, wait); 3810 struct cgroup *cgrp = event->cgrp; 3811 unsigned long flags = (unsigned long)key; 3812 3813 if (flags & POLLHUP) { 3814 /* 3815 * If the event has been detached at cgroup removal, we 3816 * can simply return knowing the other side will cleanup 3817 * for us. 3818 * 3819 * We can't race against event freeing since the other 3820 * side will require wqh->lock via remove_wait_queue(), 3821 * which we hold. 3822 */ 3823 spin_lock(&cgrp->event_list_lock); 3824 if (!list_empty(&event->list)) { 3825 list_del_init(&event->list); 3826 /* 3827 * We are in atomic context, but cgroup_event_remove() 3828 * may sleep, so we have to call it in workqueue. 3829 */ 3830 schedule_work(&event->remove); 3831 } 3832 spin_unlock(&cgrp->event_list_lock); 3833 } 3834 3835 return 0; 3836} 3837 3838static void cgroup_event_ptable_queue_proc(struct file *file, 3839 wait_queue_head_t *wqh, poll_table *pt) 3840{ 3841 struct cgroup_event *event = container_of(pt, 3842 struct cgroup_event, pt); 3843 3844 event->wqh = wqh; 3845 add_wait_queue(wqh, &event->wait); 3846} 3847 3848/* 3849 * Parse input and register new cgroup event handler. 3850 * 3851 * Input must be in format ' '. 3852 * Interpretation of args is defined by control file implementation. 3853 */ 3854static int cgroup_write_event_control(struct cgroup *cgrp, struct cftype *cft, 3855 const char *buffer) 3856{ 3857 struct cgroup_event *event = NULL; 3858 struct cgroup *cgrp_cfile; 3859 unsigned int efd, cfd; 3860 struct file *efile = NULL; 3861 struct file *cfile = NULL; 3862 char *endp; 3863 int ret; 3864 3865 efd = simple_strtoul(buffer, &endp, 10); 3866 if (*endp != ' ') 3867 return -EINVAL; 3868 buffer = endp + 1; 3869 3870 cfd = simple_strtoul(buffer, &endp, 10); 3871 if ((*endp != ' ') && (*endp != '\0')) 3872 return -EINVAL; 3873 buffer = endp + 1; 3874 3875 event = kzalloc(sizeof(*event), GFP_KERNEL); 3876 if (!event) 3877 return -ENOMEM; 3878 event->cgrp = cgrp; 3879 INIT_LIST_HEAD(&event->list); 3880 init_poll_funcptr(&event->pt, cgroup_event_ptable_queue_proc); 3881 init_waitqueue_func_entry(&event->wait, cgroup_event_wake); 3882 INIT_WORK(&event->remove, cgroup_event_remove); 3883 3884 efile = eventfd_fget(efd); 3885 if (IS_ERR(efile)) { 3886 ret = PTR_ERR(efile); 3887 goto fail; 3888 } 3889 3890 event->eventfd = eventfd_ctx_fileget(efile); 3891 if (IS_ERR(event->eventfd)) { 3892 ret = PTR_ERR(event->eventfd); 3893 goto fail; 3894 } 3895 3896 cfile = fget(cfd); 3897 if (!cfile) { 3898 ret = -EBADF; 3899 goto fail; 3900 } 3901 3902 /* the process need read permission on control file */ 3903 /* AV: shouldn't we check that it's been opened for read instead? */ 3904 ret = inode_permission(file_inode(cfile), MAY_READ); 3905 if (ret < 0) 3906 goto fail; 3907 3908 event->cft = __file_cft(cfile); 3909 if (IS_ERR(event->cft)) { 3910 ret = PTR_ERR(event->cft); 3911 goto fail; 3912 } 3913 3914 /* 3915 * The file to be monitored must be in the same cgroup as 3916 * cgroup.event_control is. 3917 */ 3918 cgrp_cfile = __d_cgrp(cfile->f_dentry->d_parent); 3919 if (cgrp_cfile != cgrp) { 3920 ret = -EINVAL; 3921 goto fail; 3922 } 3923 3924 if (!event->cft->register_event || !event->cft->unregister_event) { 3925 ret = -EINVAL; 3926 goto fail; 3927 } 3928 3929 ret = event->cft->register_event(cgrp, event->cft, 3930 event->eventfd, buffer); 3931 if (ret) 3932 goto fail; 3933 3934 if (efile->f_op->poll(efile, &event->pt) & POLLHUP) { 3935 event->cft->unregister_event(cgrp, event->cft, event->eventfd); 3936 ret = 0; 3937 goto fail; 3938 } 3939 3940 /* 3941 * Events should be removed after rmdir of cgroup directory, but before 3942 * destroying subsystem state objects. Let's take reference to cgroup 3943 * directory dentry to do that. 3944 */ 3945 dget(cgrp->dentry); 3946 3947 spin_lock(&cgrp->event_list_lock); 3948 list_add(&event->list, &cgrp->event_list); 3949 spin_unlock(&cgrp->event_list_lock); 3950 3951 fput(cfile); 3952 fput(efile); 3953 3954 return 0; 3955 3956fail: 3957 if (cfile) 3958 fput(cfile); 3959 3960 if (event && event->eventfd && !IS_ERR(event->eventfd)) 3961 eventfd_ctx_put(event->eventfd); 3962 3963 if (!IS_ERR_OR_NULL(efile)) 3964 fput(efile); 3965 3966 kfree(event); 3967 3968 return ret; 3969} 3970 3971static u64 cgroup_clone_children_read(struct cgroup *cgrp, 3972 struct cftype *cft) 3973{ 3974 return test_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3975} 3976 3977static int cgroup_clone_children_write(struct cgroup *cgrp, 3978 struct cftype *cft, 3979 u64 val) 3980{ 3981 if (val) 3982 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3983 else 3984 clear_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 3985 return 0; 3986} 3987 3988/* 3989 * for the common functions, 'private' gives the type of file 3990 */ 3991/* for hysterical raisins, we can't put this on the older files */ 3992#define CGROUP_FILE_GENERIC_PREFIX "cgroup." 3993static struct cftype files[] = { 3994 { 3995 .name = "tasks", 3996 .open = cgroup_tasks_open, 3997 .write_u64 = cgroup_tasks_write, 3998 .release = cgroup_pidlist_release, 3999 .mode = S_IRUGO | S_IWUSR, 4000 }, 4001 { 4002 .name = CGROUP_FILE_GENERIC_PREFIX "procs", 4003 .open = cgroup_procs_open, 4004 .write_u64 = cgroup_procs_write, 4005 .release = cgroup_pidlist_release, 4006 .mode = S_IRUGO | S_IWUSR, 4007 }, 4008 { 4009 .name = "notify_on_release", 4010 .read_u64 = cgroup_read_notify_on_release, 4011 .write_u64 = cgroup_write_notify_on_release, 4012 }, 4013 { 4014 .name = CGROUP_FILE_GENERIC_PREFIX "event_control", 4015 .write_string = cgroup_write_event_control, 4016 .mode = S_IWUGO, 4017 }, 4018 { 4019 .name = "cgroup.clone_children", 4020 .read_u64 = cgroup_clone_children_read, 4021 .write_u64 = cgroup_clone_children_write, 4022 }, 4023 { 4024 .name = "release_agent", 4025 .flags = CFTYPE_ONLY_ON_ROOT, 4026 .read_seq_string = cgroup_release_agent_show, 4027 .write_string = cgroup_release_agent_write, 4028 .max_write_len = PATH_MAX, 4029 }, 4030 { } /* terminate */ 4031}; 4032 4033/** 4034 * cgroup_populate_dir - selectively creation of files in a directory 4035 * @cgrp: target cgroup 4036 * @base_files: true if the base files should be added 4037 * @subsys_mask: mask of the subsystem ids whose files should be added 4038 */ 4039static int cgroup_populate_dir(struct cgroup *cgrp, bool base_files, 4040 unsigned long subsys_mask) 4041{ 4042 int err; 4043 struct cgroup_subsys *ss; 4044 4045 if (base_files) { 4046 err = cgroup_addrm_files(cgrp, NULL, files, true); 4047 if (err < 0) 4048 return err; 4049 } 4050 4051 /* process cftsets of each subsystem */ 4052 for_each_subsys(cgrp->root, ss) { 4053 struct cftype_set *set; 4054 if (!test_bit(ss->subsys_id, &subsys_mask)) 4055 continue; 4056 4057 list_for_each_entry(set, &ss->cftsets, node) 4058 cgroup_addrm_files(cgrp, ss, set->cfts, true); 4059 } 4060 4061 /* This cgroup is ready now */ 4062 for_each_subsys(cgrp->root, ss) { 4063 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4064 /* 4065 * Update id->css pointer and make this css visible from 4066 * CSS ID functions. This pointer will be dereferened 4067 * from RCU-read-side without locks. 4068 */ 4069 if (css->id) 4070 rcu_assign_pointer(css->id->css, css); 4071 } 4072 4073 return 0; 4074} 4075 4076static void css_dput_fn(struct work_struct *work) 4077{ 4078 struct cgroup_subsys_state *css = 4079 container_of(work, struct cgroup_subsys_state, dput_work); 4080 struct dentry *dentry = css->cgroup->dentry; 4081 struct super_block *sb = dentry->d_sb; 4082 4083 atomic_inc(&sb->s_active); 4084 dput(dentry); 4085 deactivate_super(sb); 4086} 4087 4088static void init_cgroup_css(struct cgroup_subsys_state *css, 4089 struct cgroup_subsys *ss, 4090 struct cgroup *cgrp) 4091{ 4092 css->cgroup = cgrp; 4093 atomic_set(&css->refcnt, 1); 4094 css->flags = 0; 4095 css->id = NULL; 4096 if (cgrp == dummytop) 4097 css->flags |= CSS_ROOT; 4098 BUG_ON(cgrp->subsys[ss->subsys_id]); 4099 cgrp->subsys[ss->subsys_id] = css; 4100 4101 /* 4102 * css holds an extra ref to @cgrp->dentry which is put on the last 4103 * css_put(). dput() requires process context, which css_put() may 4104 * be called without. @css->dput_work will be used to invoke 4105 * dput() asynchronously from css_put(). 4106 */ 4107 INIT_WORK(&css->dput_work, css_dput_fn); 4108} 4109 4110/* invoke ->post_create() on a new CSS and mark it online if successful */ 4111static int online_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4112{ 4113 int ret = 0; 4114 4115 lockdep_assert_held(&cgroup_mutex); 4116 4117 if (ss->css_online) 4118 ret = ss->css_online(cgrp); 4119 if (!ret) 4120 cgrp->subsys[ss->subsys_id]->flags |= CSS_ONLINE; 4121 return ret; 4122} 4123 4124/* if the CSS is online, invoke ->pre_destory() on it and mark it offline */ 4125static void offline_css(struct cgroup_subsys *ss, struct cgroup *cgrp) 4126 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4127{ 4128 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4129 4130 lockdep_assert_held(&cgroup_mutex); 4131 4132 if (!(css->flags & CSS_ONLINE)) 4133 return; 4134 4135 /* 4136 * css_offline() should be called with cgroup_mutex unlocked. See 4137 * 3fa59dfbc3 ("cgroup: fix potential deadlock in pre_destroy") for 4138 * details. This temporary unlocking should go away once 4139 * cgroup_mutex is unexported from controllers. 4140 */ 4141 if (ss->css_offline) { 4142 mutex_unlock(&cgroup_mutex); 4143 ss->css_offline(cgrp); 4144 mutex_lock(&cgroup_mutex); 4145 } 4146 4147 cgrp->subsys[ss->subsys_id]->flags &= ~CSS_ONLINE; 4148} 4149 4150/* 4151 * cgroup_create - create a cgroup 4152 * @parent: cgroup that will be parent of the new cgroup 4153 * @dentry: dentry of the new cgroup 4154 * @mode: mode to set on new inode 4155 * 4156 * Must be called with the mutex on the parent inode held 4157 */ 4158static long cgroup_create(struct cgroup *parent, struct dentry *dentry, 4159 umode_t mode) 4160{ 4161 struct cgroup *cgrp; 4162 struct cgroupfs_root *root = parent->root; 4163 int err = 0; 4164 struct cgroup_subsys *ss; 4165 struct super_block *sb = root->sb; 4166 4167 /* allocate the cgroup and its ID, 0 is reserved for the root */ 4168 cgrp = kzalloc(sizeof(*cgrp), GFP_KERNEL); 4169 if (!cgrp) 4170 return -ENOMEM; 4171 4172 cgrp->id = ida_simple_get(&root->cgroup_ida, 1, 0, GFP_KERNEL); 4173 if (cgrp->id < 0) 4174 goto err_free_cgrp; 4175 4176 /* 4177 * Only live parents can have children. Note that the liveliness 4178 * check isn't strictly necessary because cgroup_mkdir() and 4179 * cgroup_rmdir() are fully synchronized by i_mutex; however, do it 4180 * anyway so that locking is contained inside cgroup proper and we 4181 * don't get nasty surprises if we ever grow another caller. 4182 */ 4183 if (!cgroup_lock_live_group(parent)) { 4184 err = -ENODEV; 4185 goto err_free_id; 4186 } 4187 4188 /* Grab a reference on the superblock so the hierarchy doesn't 4189 * get deleted on unmount if there are child cgroups. This 4190 * can be done outside cgroup_mutex, since the sb can't 4191 * disappear while someone has an open control file on the 4192 * fs */ 4193 atomic_inc(&sb->s_active); 4194 4195 init_cgroup_housekeeping(cgrp); 4196 4197 dentry->d_fsdata = cgrp; 4198 cgrp->dentry = dentry; 4199 4200 cgrp->parent = parent; 4201 cgrp->root = parent->root; 4202 cgrp->top_cgroup = parent->top_cgroup; 4203 4204 if (notify_on_release(parent)) 4205 set_bit(CGRP_NOTIFY_ON_RELEASE, &cgrp->flags); 4206 4207 if (test_bit(CGRP_CPUSET_CLONE_CHILDREN, &parent->flags)) 4208 set_bit(CGRP_CPUSET_CLONE_CHILDREN, &cgrp->flags); 4209 4210 for_each_subsys(root, ss) { 4211 struct cgroup_subsys_state *css; 4212 4213 css = ss->css_alloc(cgrp); 4214 if (IS_ERR(css)) { 4215 err = PTR_ERR(css); 4216 goto err_free_all; 4217 } 4218 init_cgroup_css(css, ss, cgrp); 4219 if (ss->use_id) { 4220 err = alloc_css_id(ss, parent, cgrp); 4221 if (err) 4222 goto err_free_all; 4223 } 4224 } 4225 4226 /* 4227 * Create directory. cgroup_create_file() returns with the new 4228 * directory locked on success so that it can be populated without 4229 * dropping cgroup_mutex. 4230 */ 4231 err = cgroup_create_file(dentry, S_IFDIR | mode, sb); 4232 if (err < 0) 4233 goto err_free_all; 4234 lockdep_assert_held(&dentry->d_inode->i_mutex); 4235 4236 /* allocation complete, commit to creation */ 4237 list_add_tail(&cgrp->allcg_node, &root->allcg_list); 4238 list_add_tail_rcu(&cgrp->sibling, &cgrp->parent->children); 4239 root->number_of_cgroups++; 4240 4241 /* each css holds a ref to the cgroup's dentry */ 4242 for_each_subsys(root, ss) 4243 dget(dentry); 4244 4245 /* creation succeeded, notify subsystems */ 4246 for_each_subsys(root, ss) { 4247 err = online_css(ss, cgrp); 4248 if (err) 4249 goto err_destroy; 4250 4251 if (ss->broken_hierarchy && !ss->warned_broken_hierarchy && 4252 parent->parent) { 4253 pr_warning("cgroup: %s (%d) created nested cgroup for controller \"%s\" which has incomplete hierarchy support. Nested cgroups may change behavior in the future.\n", 4254 current->comm, current->pid, ss->name); 4255 if (!strcmp(ss->name, "memory")) 4256 pr_warning("cgroup: \"memory\" requires setting use_hierarchy to 1 on the root.\n"); 4257 ss->warned_broken_hierarchy = true; 4258 } 4259 } 4260 4261 err = cgroup_populate_dir(cgrp, true, root->subsys_mask); 4262 if (err) 4263 goto err_destroy; 4264 4265 mutex_unlock(&cgroup_mutex); 4266 mutex_unlock(&cgrp->dentry->d_inode->i_mutex); 4267 4268 return 0; 4269 4270err_free_all: 4271 for_each_subsys(root, ss) { 4272 if (cgrp->subsys[ss->subsys_id]) 4273 ss->css_free(cgrp); 4274 } 4275 mutex_unlock(&cgroup_mutex); 4276 /* Release the reference count that we took on the superblock */ 4277 deactivate_super(sb); 4278err_free_id: 4279 ida_simple_remove(&root->cgroup_ida, cgrp->id); 4280err_free_cgrp: 4281 kfree(cgrp); 4282 return err; 4283 4284err_destroy: 4285 cgroup_destroy_locked(cgrp); 4286 mutex_unlock(&cgroup_mutex); 4287 mutex_unlock(&dentry->d_inode->i_mutex); 4288 return err; 4289} 4290 4291static int cgroup_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) 4292{ 4293 struct cgroup *c_parent = dentry->d_parent->d_fsdata; 4294 4295 /* the vfs holds inode->i_mutex already */ 4296 return cgroup_create(c_parent, dentry, mode | S_IFDIR); 4297} 4298 4299/* 4300 * Check the reference count on each subsystem. Since we already 4301 * established that there are no tasks in the cgroup, if the css refcount 4302 * is also 1, then there should be no outstanding references, so the 4303 * subsystem is safe to destroy. We scan across all subsystems rather than 4304 * using the per-hierarchy linked list of mounted subsystems since we can 4305 * be called via check_for_release() with no synchronization other than 4306 * RCU, and the subsystem linked list isn't RCU-safe. 4307 */ 4308static int cgroup_has_css_refs(struct cgroup *cgrp) 4309{ 4310 int i; 4311 4312 /* 4313 * We won't need to lock the subsys array, because the subsystems 4314 * we're concerned about aren't going anywhere since our cgroup root 4315 * has a reference on them. 4316 */ 4317 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4318 struct cgroup_subsys *ss = subsys[i]; 4319 struct cgroup_subsys_state *css; 4320 4321 /* Skip subsystems not present or not in this hierarchy */ 4322 if (ss == NULL || ss->root != cgrp->root) 4323 continue; 4324 4325 css = cgrp->subsys[ss->subsys_id]; 4326 /* 4327 * When called from check_for_release() it's possible 4328 * that by this point the cgroup has been removed 4329 * and the css deleted. But a false-positive doesn't 4330 * matter, since it can only happen if the cgroup 4331 * has been deleted and hence no longer needs the 4332 * release agent to be called anyway. 4333 */ 4334 if (css && css_refcnt(css) > 1) 4335 return 1; 4336 } 4337 return 0; 4338} 4339 4340static int cgroup_destroy_locked(struct cgroup *cgrp) 4341 __releases(&cgroup_mutex) __acquires(&cgroup_mutex) 4342{ 4343 struct dentry *d = cgrp->dentry; 4344 struct cgroup *parent = cgrp->parent; 4345 DEFINE_WAIT(wait); 4346 struct cgroup_event *event, *tmp; 4347 struct cgroup_subsys *ss; 4348 LIST_HEAD(tmp_list); 4349 4350 lockdep_assert_held(&d->d_inode->i_mutex); 4351 lockdep_assert_held(&cgroup_mutex); 4352 4353 if (atomic_read(&cgrp->count) || !list_empty(&cgrp->children)) 4354 return -EBUSY; 4355 4356 /* 4357 * Block new css_tryget() by deactivating refcnt and mark @cgrp 4358 * removed. This makes future css_tryget() and child creation 4359 * attempts fail thus maintaining the removal conditions verified 4360 * above. 4361 */ 4362 for_each_subsys(cgrp->root, ss) { 4363 struct cgroup_subsys_state *css = cgrp->subsys[ss->subsys_id]; 4364 4365 WARN_ON(atomic_read(&css->refcnt) < 0); 4366 atomic_add(CSS_DEACT_BIAS, &css->refcnt); 4367 } 4368 set_bit(CGRP_REMOVED, &cgrp->flags); 4369 4370 /* tell subsystems to initate destruction */ 4371 for_each_subsys(cgrp->root, ss) 4372 offline_css(ss, cgrp); 4373 4374 /* 4375 * Put all the base refs. Each css holds an extra reference to the 4376 * cgroup's dentry and cgroup removal proceeds regardless of css 4377 * refs. On the last put of each css, whenever that may be, the 4378 * extra dentry ref is put so that dentry destruction happens only 4379 * after all css's are released. 4380 */ 4381 for_each_subsys(cgrp->root, ss) 4382 css_put(cgrp->subsys[ss->subsys_id]); 4383 4384 raw_spin_lock(&release_list_lock); 4385 if (!list_empty(&cgrp->release_list)) 4386 list_del_init(&cgrp->release_list); 4387 raw_spin_unlock(&release_list_lock); 4388 4389 /* delete this cgroup from parent->children */ 4390 list_del_rcu(&cgrp->sibling); 4391 list_del_init(&cgrp->allcg_node); 4392 4393 dget(d); 4394 cgroup_d_remove_dir(d); 4395 dput(d); 4396 4397 set_bit(CGRP_RELEASABLE, &parent->flags); 4398 check_for_release(parent); 4399 4400 /* 4401 * Unregister events and notify userspace. 4402 * Notify userspace about cgroup removing only after rmdir of cgroup 4403 * directory to avoid race between userspace and kernelspace. 4404 */ 4405 spin_lock(&cgrp->event_list_lock); 4406 list_for_each_entry_safe(event, tmp, &cgrp->event_list, list) { 4407 list_del_init(&event->list); 4408 schedule_work(&event->remove); 4409 } 4410 spin_unlock(&cgrp->event_list_lock); 4411 4412 return 0; 4413} 4414 4415static int cgroup_rmdir(struct inode *unused_dir, struct dentry *dentry) 4416{ 4417 int ret; 4418 4419 mutex_lock(&cgroup_mutex); 4420 ret = cgroup_destroy_locked(dentry->d_fsdata); 4421 mutex_unlock(&cgroup_mutex); 4422 4423 return ret; 4424} 4425 4426static void __init_or_module cgroup_init_cftsets(struct cgroup_subsys *ss) 4427{ 4428 INIT_LIST_HEAD(&ss->cftsets); 4429 4430 /* 4431 * base_cftset is embedded in subsys itself, no need to worry about 4432 * deregistration. 4433 */ 4434 if (ss->base_cftypes) { 4435 ss->base_cftset.cfts = ss->base_cftypes; 4436 list_add_tail(&ss->base_cftset.node, &ss->cftsets); 4437 } 4438} 4439 4440static void __init cgroup_init_subsys(struct cgroup_subsys *ss) 4441{ 4442 struct cgroup_subsys_state *css; 4443 4444 printk(KERN_INFO "Initializing cgroup subsys %s\n", ss->name); 4445 4446 mutex_lock(&cgroup_mutex); 4447 4448 /* init base cftset */ 4449 cgroup_init_cftsets(ss); 4450 4451 /* Create the top cgroup state for this subsystem */ 4452 list_add(&ss->sibling, &rootnode.subsys_list); 4453 ss->root = &rootnode; 4454 css = ss->css_alloc(dummytop); 4455 /* We don't handle early failures gracefully */ 4456 BUG_ON(IS_ERR(css)); 4457 init_cgroup_css(css, ss, dummytop); 4458 4459 /* Update the init_css_set to contain a subsys 4460 * pointer to this state - since the subsystem is 4461 * newly registered, all tasks and hence the 4462 * init_css_set is in the subsystem's top cgroup. */ 4463 init_css_set.subsys[ss->subsys_id] = css; 4464 4465 need_forkexit_callback |= ss->fork || ss->exit; 4466 4467 /* At system boot, before all subsystems have been 4468 * registered, no tasks have been forked, so we don't 4469 * need to invoke fork callbacks here. */ 4470 BUG_ON(!list_empty(&init_task.tasks)); 4471 4472 ss->active = 1; 4473 BUG_ON(online_css(ss, dummytop)); 4474 4475 mutex_unlock(&cgroup_mutex); 4476 4477 /* this function shouldn't be used with modular subsystems, since they 4478 * need to register a subsys_id, among other things */ 4479 BUG_ON(ss->module); 4480} 4481 4482/** 4483 * cgroup_load_subsys: load and register a modular subsystem at runtime 4484 * @ss: the subsystem to load 4485 * 4486 * This function should be called in a modular subsystem's initcall. If the 4487 * subsystem is built as a module, it will be assigned a new subsys_id and set 4488 * up for use. If the subsystem is built-in anyway, work is delegated to the 4489 * simpler cgroup_init_subsys. 4490 */ 4491int __init_or_module cgroup_load_subsys(struct cgroup_subsys *ss) 4492{ 4493 struct cgroup_subsys_state *css; 4494 int i, ret; 4495 struct hlist_node *tmp; 4496 struct css_set *cg; 4497 unsigned long key; 4498 4499 /* check name and function validity */ 4500 if (ss->name == NULL || strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN || 4501 ss->css_alloc == NULL || ss->css_free == NULL) 4502 return -EINVAL; 4503 4504 /* 4505 * we don't support callbacks in modular subsystems. this check is 4506 * before the ss->module check for consistency; a subsystem that could 4507 * be a module should still have no callbacks even if the user isn't 4508 * compiling it as one. 4509 */ 4510 if (ss->fork || ss->exit) 4511 return -EINVAL; 4512 4513 /* 4514 * an optionally modular subsystem is built-in: we want to do nothing, 4515 * since cgroup_init_subsys will have already taken care of it. 4516 */ 4517 if (ss->module == NULL) { 4518 /* a sanity check */ 4519 BUG_ON(subsys[ss->subsys_id] != ss); 4520 return 0; 4521 } 4522 4523 /* init base cftset */ 4524 cgroup_init_cftsets(ss); 4525 4526 mutex_lock(&cgroup_mutex); 4527 subsys[ss->subsys_id] = ss; 4528 4529 /* 4530 * no ss->css_alloc seems to need anything important in the ss 4531 * struct, so this can happen first (i.e. before the rootnode 4532 * attachment). 4533 */ 4534 css = ss->css_alloc(dummytop); 4535 if (IS_ERR(css)) { 4536 /* failure case - need to deassign the subsys[] slot. */ 4537 subsys[ss->subsys_id] = NULL; 4538 mutex_unlock(&cgroup_mutex); 4539 return PTR_ERR(css); 4540 } 4541 4542 list_add(&ss->sibling, &rootnode.subsys_list); 4543 ss->root = &rootnode; 4544 4545 /* our new subsystem will be attached to the dummy hierarchy. */ 4546 init_cgroup_css(css, ss, dummytop); 4547 /* init_idr must be after init_cgroup_css because it sets css->id. */ 4548 if (ss->use_id) { 4549 ret = cgroup_init_idr(ss, css); 4550 if (ret) 4551 goto err_unload; 4552 } 4553 4554 /* 4555 * Now we need to entangle the css into the existing css_sets. unlike 4556 * in cgroup_init_subsys, there are now multiple css_sets, so each one 4557 * will need a new pointer to it; done by iterating the css_set_table. 4558 * furthermore, modifying the existing css_sets will corrupt the hash 4559 * table state, so each changed css_set will need its hash recomputed. 4560 * this is all done under the css_set_lock. 4561 */ 4562 write_lock(&css_set_lock); 4563 hash_for_each_safe(css_set_table, i, tmp, cg, hlist) { 4564 /* skip entries that we already rehashed */ 4565 if (cg->subsys[ss->subsys_id]) 4566 continue; 4567 /* remove existing entry */ 4568 hash_del(&cg->hlist); 4569 /* set new value */ 4570 cg->subsys[ss->subsys_id] = css; 4571 /* recompute hash and restore entry */ 4572 key = css_set_hash(cg->subsys); 4573 hash_add(css_set_table, &cg->hlist, key); 4574 } 4575 write_unlock(&css_set_lock); 4576 4577 ss->active = 1; 4578 ret = online_css(ss, dummytop); 4579 if (ret) 4580 goto err_unload; 4581 4582 /* success! */ 4583 mutex_unlock(&cgroup_mutex); 4584 return 0; 4585 4586err_unload: 4587 mutex_unlock(&cgroup_mutex); 4588 /* @ss can't be mounted here as try_module_get() would fail */ 4589 cgroup_unload_subsys(ss); 4590 return ret; 4591} 4592EXPORT_SYMBOL_GPL(cgroup_load_subsys); 4593 4594/** 4595 * cgroup_unload_subsys: unload a modular subsystem 4596 * @ss: the subsystem to unload 4597 * 4598 * This function should be called in a modular subsystem's exitcall. When this 4599 * function is invoked, the refcount on the subsystem's module will be 0, so 4600 * the subsystem will not be attached to any hierarchy. 4601 */ 4602void cgroup_unload_subsys(struct cgroup_subsys *ss) 4603{ 4604 struct cg_cgroup_link *link; 4605 4606 BUG_ON(ss->module == NULL); 4607 4608 /* 4609 * we shouldn't be called if the subsystem is in use, and the use of 4610 * try_module_get in parse_cgroupfs_options should ensure that it 4611 * doesn't start being used while we're killing it off. 4612 */ 4613 BUG_ON(ss->root != &rootnode); 4614 4615 mutex_lock(&cgroup_mutex); 4616 4617 offline_css(ss, dummytop); 4618 ss->active = 0; 4619 4620 if (ss->use_id) 4621 idr_destroy(&ss->idr); 4622 4623 /* deassign the subsys_id */ 4624 subsys[ss->subsys_id] = NULL; 4625 4626 /* remove subsystem from rootnode's list of subsystems */ 4627 list_del_init(&ss->sibling); 4628 4629 /* 4630 * disentangle the css from all css_sets attached to the dummytop. as 4631 * in loading, we need to pay our respects to the hashtable gods. 4632 */ 4633 write_lock(&css_set_lock); 4634 list_for_each_entry(link, &dummytop->css_sets, cgrp_link_list) { 4635 struct css_set *cg = link->cg; 4636 unsigned long key; 4637 4638 hash_del(&cg->hlist); 4639 cg->subsys[ss->subsys_id] = NULL; 4640 key = css_set_hash(cg->subsys); 4641 hash_add(css_set_table, &cg->hlist, key); 4642 } 4643 write_unlock(&css_set_lock); 4644 4645 /* 4646 * remove subsystem's css from the dummytop and free it - need to 4647 * free before marking as null because ss->css_free needs the 4648 * cgrp->subsys pointer to find their state. note that this also 4649 * takes care of freeing the css_id. 4650 */ 4651 ss->css_free(dummytop); 4652 dummytop->subsys[ss->subsys_id] = NULL; 4653 4654 mutex_unlock(&cgroup_mutex); 4655} 4656EXPORT_SYMBOL_GPL(cgroup_unload_subsys); 4657 4658/** 4659 * cgroup_init_early - cgroup initialization at system boot 4660 * 4661 * Initialize cgroups at system boot, and initialize any 4662 * subsystems that request early init. 4663 */ 4664int __init cgroup_init_early(void) 4665{ 4666 int i; 4667 atomic_set(&init_css_set.refcount, 1); 4668 INIT_LIST_HEAD(&init_css_set.cg_links); 4669 INIT_LIST_HEAD(&init_css_set.tasks); 4670 INIT_HLIST_NODE(&init_css_set.hlist); 4671 css_set_count = 1; 4672 init_cgroup_root(&rootnode); 4673 root_count = 1; 4674 init_task.cgroups = &init_css_set; 4675 4676 init_css_set_link.cg = &init_css_set; 4677 init_css_set_link.cgrp = dummytop; 4678 list_add(&init_css_set_link.cgrp_link_list, 4679 &rootnode.top_cgroup.css_sets); 4680 list_add(&init_css_set_link.cg_link_list, 4681 &init_css_set.cg_links); 4682 4683 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4684 struct cgroup_subsys *ss = subsys[i]; 4685 4686 /* at bootup time, we don't worry about modular subsystems */ 4687 if (!ss || ss->module) 4688 continue; 4689 4690 BUG_ON(!ss->name); 4691 BUG_ON(strlen(ss->name) > MAX_CGROUP_TYPE_NAMELEN); 4692 BUG_ON(!ss->css_alloc); 4693 BUG_ON(!ss->css_free); 4694 if (ss->subsys_id != i) { 4695 printk(KERN_ERR "cgroup: Subsys %s id == %d\n", 4696 ss->name, ss->subsys_id); 4697 BUG(); 4698 } 4699 4700 if (ss->early_init) 4701 cgroup_init_subsys(ss); 4702 } 4703 return 0; 4704} 4705 4706/** 4707 * cgroup_init - cgroup initialization 4708 * 4709 * Register cgroup filesystem and /proc file, and initialize 4710 * any subsystems that didn't request early init. 4711 */ 4712int __init cgroup_init(void) 4713{ 4714 int err; 4715 int i; 4716 unsigned long key; 4717 4718 err = bdi_init(&cgroup_backing_dev_info); 4719 if (err) 4720 return err; 4721 4722 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4723 struct cgroup_subsys *ss = subsys[i]; 4724 4725 /* at bootup time, we don't worry about modular subsystems */ 4726 if (!ss || ss->module) 4727 continue; 4728 if (!ss->early_init) 4729 cgroup_init_subsys(ss); 4730 if (ss->use_id) 4731 cgroup_init_idr(ss, init_css_set.subsys[ss->subsys_id]); 4732 } 4733 4734 /* Add init_css_set to the hash table */ 4735 key = css_set_hash(init_css_set.subsys); 4736 hash_add(css_set_table, &init_css_set.hlist, key); 4737 BUG_ON(!init_root_id(&rootnode)); 4738 4739 cgroup_kobj = kobject_create_and_add("cgroup", fs_kobj); 4740 if (!cgroup_kobj) { 4741 err = -ENOMEM; 4742 goto out; 4743 } 4744 4745 err = register_filesystem(&cgroup_fs_type); 4746 if (err < 0) { 4747 kobject_put(cgroup_kobj); 4748 goto out; 4749 } 4750 4751 proc_create("cgroups", 0, NULL, &proc_cgroupstats_operations); 4752 4753out: 4754 if (err) 4755 bdi_destroy(&cgroup_backing_dev_info); 4756 4757 return err; 4758} 4759 4760/* 4761 * proc_cgroup_show() 4762 * - Print task's cgroup paths into seq_file, one line for each hierarchy 4763 * - Used for /proc/ /cgroup. 4764 * - No need to task_lock(tsk) on this tsk->cgroup reference, as it 4765 * doesn't really matter if tsk->cgroup changes after we read it, 4766 * and we take cgroup_mutex, keeping cgroup_attach_task() from changing it 4767 * anyway. No need to check that tsk->cgroup != NULL, thanks to 4768 * the_top_cgroup_hack in cgroup_exit(), which sets an exiting tasks 4769 * cgroup to top_cgroup. 4770 */ 4771 4772/* TODO: Use a proper seq_file iterator */ 4773static int proc_cgroup_show(struct seq_file *m, void *v) 4774{ 4775 struct pid *pid; 4776 struct task_struct *tsk; 4777 char *buf; 4778 int retval; 4779 struct cgroupfs_root *root; 4780 4781 retval = -ENOMEM; 4782 buf = kmalloc(PAGE_SIZE, GFP_KERNEL); 4783 if (!buf) 4784 goto out; 4785 4786 retval = -ESRCH; 4787 pid = m->private; 4788 tsk = get_pid_task(pid, PIDTYPE_PID); 4789 if (!tsk) 4790 goto out_free; 4791 4792 retval = 0; 4793 4794 mutex_lock(&cgroup_mutex); 4795 4796 for_each_active_root(root) { 4797 struct cgroup_subsys *ss; 4798 struct cgroup *cgrp; 4799 int count = 0; 4800 4801 seq_printf(m, "%d:", root->hierarchy_id); 4802 for_each_subsys(root, ss) 4803 seq_printf(m, "%s%s", count++ ? "," : "", ss->name); 4804 if (strlen(root->name)) 4805 seq_printf(m, "%sname=%s", count ? "," : "", 4806 root->name); 4807 seq_putc(m, ':'); 4808 cgrp = task_cgroup_from_root(tsk, root); 4809 retval = cgroup_path(cgrp, buf, PAGE_SIZE); 4810 if (retval < 0) 4811 goto out_unlock; 4812 seq_puts(m, buf); 4813 seq_putc(m, '\n'); 4814 } 4815 4816out_unlock: 4817 mutex_unlock(&cgroup_mutex); 4818 put_task_struct(tsk); 4819out_free: 4820 kfree(buf); 4821out: 4822 return retval; 4823} 4824 4825static int cgroup_open(struct inode *inode, struct file *file) 4826{ 4827 struct pid *pid = PROC_I(inode)->pid; 4828 return single_open(file, proc_cgroup_show, pid); 4829} 4830 4831const struct file_operations proc_cgroup_operations = { 4832 .open = cgroup_open, 4833 .read = seq_read, 4834 .llseek = seq_lseek, 4835 .release = single_release, 4836}; 4837 4838/* Display information about each subsystem and each hierarchy */ 4839static int proc_cgroupstats_show(struct seq_file *m, void *v) 4840{ 4841 int i; 4842 4843 seq_puts(m, "#subsys_name\thierarchy\tnum_cgroups\tenabled\n"); 4844 /* 4845 * ideally we don't want subsystems moving around while we do this. 4846 * cgroup_mutex is also necessary to guarantee an atomic snapshot of 4847 * subsys/hierarchy state. 4848 */ 4849 mutex_lock(&cgroup_mutex); 4850 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4851 struct cgroup_subsys *ss = subsys[i]; 4852 if (ss == NULL) 4853 continue; 4854 seq_printf(m, "%s\t%d\t%d\t%d\n", 4855 ss->name, ss->root->hierarchy_id, 4856 ss->root->number_of_cgroups, !ss->disabled); 4857 } 4858 mutex_unlock(&cgroup_mutex); 4859 return 0; 4860} 4861 4862static int cgroupstats_open(struct inode *inode, struct file *file) 4863{ 4864 return single_open(file, proc_cgroupstats_show, NULL); 4865} 4866 4867static const struct file_operations proc_cgroupstats_operations = { 4868 .open = cgroupstats_open, 4869 .read = seq_read, 4870 .llseek = seq_lseek, 4871 .release = single_release, 4872}; 4873 4874/** 4875 * cgroup_fork - attach newly forked task to its parents cgroup. 4876 * @child: pointer to task_struct of forking parent process. 4877 * 4878 * Description: A task inherits its parent's cgroup at fork(). 4879 * 4880 * A pointer to the shared css_set was automatically copied in 4881 * fork.c by dup_task_struct(). However, we ignore that copy, since 4882 * it was not made under the protection of RCU or cgroup_mutex, so 4883 * might no longer be a valid cgroup pointer. cgroup_attach_task() might 4884 * have already changed current->cgroups, allowing the previously 4885 * referenced cgroup group to be removed and freed. 4886 * 4887 * At the point that cgroup_fork() is called, 'current' is the parent 4888 * task, and the passed argument 'child' points to the child task. 4889 */ 4890void cgroup_fork(struct task_struct *child) 4891{ 4892 task_lock(current); 4893 child->cgroups = current->cgroups; 4894 get_css_set(child->cgroups); 4895 task_unlock(current); 4896 INIT_LIST_HEAD(&child->cg_list); 4897} 4898 4899/** 4900 * cgroup_post_fork - called on a new task after adding it to the task list 4901 * @child: the task in question 4902 * 4903 * Adds the task to the list running through its css_set if necessary and 4904 * call the subsystem fork() callbacks. Has to be after the task is 4905 * visible on the task list in case we race with the first call to 4906 * cgroup_iter_start() - to guarantee that the new task ends up on its 4907 * list. 4908 */ 4909void cgroup_post_fork(struct task_struct *child) 4910{ 4911 int i; 4912 4913 /* 4914 * use_task_css_set_links is set to 1 before we walk the tasklist 4915 * under the tasklist_lock and we read it here after we added the child 4916 * to the tasklist under the tasklist_lock as well. If the child wasn't 4917 * yet in the tasklist when we walked through it from 4918 * cgroup_enable_task_cg_lists(), then use_task_css_set_links value 4919 * should be visible now due to the paired locking and barriers implied 4920 * by LOCK/UNLOCK: it is written before the tasklist_lock unlock 4921 * in cgroup_enable_task_cg_lists() and read here after the tasklist_lock 4922 * lock on fork. 4923 */ 4924 if (use_task_css_set_links) { 4925 write_lock(&css_set_lock); 4926 task_lock(child); 4927 if (list_empty(&child->cg_list)) 4928 list_add(&child->cg_list, &child->cgroups->tasks); 4929 task_unlock(child); 4930 write_unlock(&css_set_lock); 4931 } 4932 4933 /* 4934 * Call ss->fork(). This must happen after @child is linked on 4935 * css_set; otherwise, @child might change state between ->fork() 4936 * and addition to css_set. 4937 */ 4938 if (need_forkexit_callback) { 4939 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 4940 struct cgroup_subsys *ss = subsys[i]; 4941 4942 /* 4943 * fork/exit callbacks are supported only for 4944 * builtin subsystems and we don't need further 4945 * synchronization as they never go away. 4946 */ 4947 if (!ss || ss->module) 4948 continue; 4949 4950 if (ss->fork) 4951 ss->fork(child); 4952 } 4953 } 4954} 4955 4956/** 4957 * cgroup_exit - detach cgroup from exiting task 4958 * @tsk: pointer to task_struct of exiting process 4959 * @run_callback: run exit callbacks? 4960 * 4961 * Description: Detach cgroup from @tsk and release it. 4962 * 4963 * Note that cgroups marked notify_on_release force every task in 4964 * them to take the global cgroup_mutex mutex when exiting. 4965 * This could impact scaling on very large systems. Be reluctant to 4966 * use notify_on_release cgroups where very high task exit scaling 4967 * is required on large systems. 4968 * 4969 * the_top_cgroup_hack: 4970 * 4971 * Set the exiting tasks cgroup to the root cgroup (top_cgroup). 4972 * 4973 * We call cgroup_exit() while the task is still competent to 4974 * handle notify_on_release(), then leave the task attached to the 4975 * root cgroup in each hierarchy for the remainder of its exit. 4976 * 4977 * To do this properly, we would increment the reference count on 4978 * top_cgroup, and near the very end of the kernel/exit.c do_exit() 4979 * code we would add a second cgroup function call, to drop that 4980 * reference. This would just create an unnecessary hot spot on 4981 * the top_cgroup reference count, to no avail. 4982 * 4983 * Normally, holding a reference to a cgroup without bumping its 4984 * count is unsafe. The cgroup could go away, or someone could 4985 * attach us to a different cgroup, decrementing the count on 4986 * the first cgroup that we never incremented. But in this case, 4987 * top_cgroup isn't going away, and either task has PF_EXITING set, 4988 * which wards off any cgroup_attach_task() attempts, or task is a failed 4989 * fork, never visible to cgroup_attach_task. 4990 */ 4991void cgroup_exit(struct task_struct *tsk, int run_callbacks) 4992{ 4993 struct css_set *cg; 4994 int i; 4995 4996 /* 4997 * Unlink from the css_set task list if necessary. 4998 * Optimistically check cg_list before taking 4999 * css_set_lock 5000 */ 5001 if (!list_empty(&tsk->cg_list)) { 5002 write_lock(&css_set_lock); 5003 if (!list_empty(&tsk->cg_list)) 5004 list_del_init(&tsk->cg_list); 5005 write_unlock(&css_set_lock); 5006 } 5007 5008 /* Reassign the task to the init_css_set. */ 5009 task_lock(tsk); 5010 cg = tsk->cgroups; 5011 tsk->cgroups = &init_css_set; 5012 5013 if (run_callbacks && need_forkexit_callback) { 5014 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 5015 struct cgroup_subsys *ss = subsys[i]; 5016 5017 /* modular subsystems can't use callbacks */ 5018 if (!ss || ss->module) 5019 continue; 5020 5021 if (ss->exit) { 5022 struct cgroup *old_cgrp = 5023 rcu_dereference_raw(cg->subsys[i])->cgroup; 5024 struct cgroup *cgrp = task_cgroup(tsk, i); 5025 ss->exit(cgrp, old_cgrp, tsk); 5026 } 5027 } 5028 } 5029 task_unlock(tsk); 5030 5031 put_css_set_taskexit(cg); 5032} 5033 5034/** 5035 * cgroup_is_descendant - see if @cgrp is a descendant of @task's cgrp 5036 * @cgrp: the cgroup in question 5037 * @task: the task in question 5038 * 5039 * See if @cgrp is a descendant of @task's cgroup in the appropriate 5040 * hierarchy. 5041 * 5042 * If we are sending in dummytop, then presumably we are creating 5043 * the top cgroup in the subsystem. 5044 * 5045 * Called only by the ns (nsproxy) cgroup. 5046 */ 5047int cgroup_is_descendant(const struct cgroup *cgrp, struct task_struct *task) 5048{ 5049 int ret; 5050 struct cgroup *target; 5051 5052 if (cgrp == dummytop) 5053 return 1; 5054 5055 target = task_cgroup_from_root(task, cgrp->root); 5056 while (cgrp != target && cgrp!= cgrp->top_cgroup) 5057 cgrp = cgrp->parent; 5058 ret = (cgrp == target); 5059 return ret; 5060} 5061 5062static void check_for_release(struct cgroup *cgrp) 5063{ 5064 /* All of these checks rely on RCU to keep the cgroup 5065 * structure alive */ 5066 if (cgroup_is_releasable(cgrp) && !atomic_read(&cgrp->count) 5067 && list_empty(&cgrp->children) && !cgroup_has_css_refs(cgrp)) { 5068 /* Control Group is currently removeable. If it's not 5069 * already queued for a userspace notification, queue 5070 * it now */ 5071 int need_schedule_work = 0; 5072 raw_spin_lock(&release_list_lock); 5073 if (!cgroup_is_removed(cgrp) && 5074 list_empty(&cgrp->release_list)) { 5075 list_add(&cgrp->release_list, &release_list); 5076 need_schedule_work = 1; 5077 } 5078 raw_spin_unlock(&release_list_lock); 5079 if (need_schedule_work) 5080 schedule_work(&release_agent_work); 5081 } 5082} 5083 5084/* Caller must verify that the css is not for root cgroup */ 5085bool __css_tryget(struct cgroup_subsys_state *css) 5086{ 5087 while (true) { 5088 int t, v; 5089 5090 v = css_refcnt(css); 5091 t = atomic_cmpxchg(&css->refcnt, v, v + 1); 5092 if (likely(t == v)) 5093 return true; 5094 else if (t < 0) 5095 return false; 5096 cpu_relax(); 5097 } 5098} 5099EXPORT_SYMBOL_GPL(__css_tryget); 5100 5101/* Caller must verify that the css is not for root cgroup */ 5102void __css_put(struct cgroup_subsys_state *css) 5103{ 5104 struct cgroup *cgrp = css->cgroup; 5105 int v; 5106 5107 rcu_read_lock(); 5108 v = css_unbias_refcnt(atomic_dec_return(&css->refcnt)); 5109 5110 switch (v) { 5111 case 1: 5112 if (notify_on_release(cgrp)) { 5113 set_bit(CGRP_RELEASABLE, &cgrp->flags); 5114 check_for_release(cgrp); 5115 } 5116 break; 5117 case 0: 5118 schedule_work(&css->dput_work); 5119 break; 5120 } 5121 rcu_read_unlock(); 5122} 5123EXPORT_SYMBOL_GPL(__css_put); 5124 5125/* 5126 * Notify userspace when a cgroup is released, by running the 5127 * configured release agent with the name of the cgroup (path 5128 * relative to the root of cgroup file system) as the argument. 5129 * 5130 * Most likely, this user command will try to rmdir this cgroup. 5131 * 5132 * This races with the possibility that some other task will be 5133 * attached to this cgroup before it is removed, or that some other 5134 * user task will 'mkdir' a child cgroup of this cgroup. That's ok. 5135 * The presumed 'rmdir' will fail quietly if this cgroup is no longer 5136 * unused, and this cgroup will be reprieved from its death sentence, 5137 * to continue to serve a useful existence. Next time it's released, 5138 * we will get notified again, if it still has 'notify_on_release' set. 5139 * 5140 * The final arg to call_usermodehelper() is UMH_WAIT_EXEC, which 5141 * means only wait until the task is successfully execve()'d. The 5142 * separate release agent task is forked by call_usermodehelper(), 5143 * then control in this thread returns here, without waiting for the 5144 * release agent task. We don't bother to wait because the caller of 5145 * this routine has no use for the exit status of the release agent 5146 * task, so no sense holding our caller up for that. 5147 */ 5148static void cgroup_release_agent(struct work_struct *work) 5149{ 5150 BUG_ON(work != &release_agent_work); 5151 mutex_lock(&cgroup_mutex); 5152 raw_spin_lock(&release_list_lock); 5153 while (!list_empty(&release_list)) { 5154 char *argv[3], *envp[3]; 5155 int i; 5156 char *pathbuf = NULL, *agentbuf = NULL; 5157 struct cgroup *cgrp = list_entry(release_list.next, 5158 struct cgroup, 5159 release_list); 5160 list_del_init(&cgrp->release_list); 5161 raw_spin_unlock(&release_list_lock); 5162 pathbuf = kmalloc(PAGE_SIZE, GFP_KERNEL); 5163 if (!pathbuf) 5164 goto continue_free; 5165 if (cgroup_path(cgrp, pathbuf, PAGE_SIZE) < 0) 5166 goto continue_free; 5167 agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL); 5168 if (!agentbuf) 5169 goto continue_free; 5170 5171 i = 0; 5172 argv[i++] = agentbuf; 5173 argv[i++] = pathbuf; 5174 argv[i] = NULL; 5175 5176 i = 0; 5177 /* minimal command environment */ 5178 envp[i++] = "HOME=/"; 5179 envp[i++] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin"; 5180 envp[i] = NULL; 5181 5182 /* Drop the lock while we invoke the usermode helper, 5183 * since the exec could involve hitting disk and hence 5184 * be a slow process */ 5185 mutex_unlock(&cgroup_mutex); 5186 call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC); 5187 mutex_lock(&cgroup_mutex); 5188 continue_free: 5189 kfree(pathbuf); 5190 kfree(agentbuf); 5191 raw_spin_lock(&release_list_lock); 5192 } 5193 raw_spin_unlock(&release_list_lock); 5194 mutex_unlock(&cgroup_mutex); 5195} 5196 5197static int __init cgroup_disable(char *str) 5198{ 5199 int i; 5200 char *token; 5201 5202 while ((token = strsep(&str, ",")) != NULL) { 5203 if (!*token) 5204 continue; 5205 for (i = 0; i < CGROUP_SUBSYS_COUNT; i++) { 5206 struct cgroup_subsys *ss = subsys[i]; 5207 5208 /* 5209 * cgroup_disable, being at boot time, can't 5210 * know about module subsystems, so we don't 5211 * worry about them. 5212 */ 5213 if (!ss || ss->module) 5214 continue; 5215 5216 if (!strcmp(token, ss->name)) { 5217 ss->disabled = 1; 5218 printk(KERN_INFO "Disabling %s control group" 5219 " subsystem\n", ss->name); 5220 break; 5221 } 5222 } 5223 } 5224 return 1; 5225} 5226__setup("cgroup_disable=", cgroup_disable); 5227 5228/* 5229 * Functons for CSS ID. 5230 */ 5231 5232/* 5233 *To get ID other than 0, this should be called when !cgroup_is_removed(). 5234 */ 5235unsigned short css_id(struct cgroup_subsys_state *css) 5236{ 5237 struct css_id *cssid; 5238 5239 /* 5240 * This css_id() can return correct value when somone has refcnt 5241 * on this or this is under rcu_read_lock(). Once css->id is allocated, 5242 * it's unchanged until freed. 5243 */ 5244 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5245 5246 if (cssid) 5247 return cssid->id; 5248 return 0; 5249} 5250EXPORT_SYMBOL_GPL(css_id); 5251 5252unsigned short css_depth(struct cgroup_subsys_state *css) 5253{ 5254 struct css_id *cssid; 5255 5256 cssid = rcu_dereference_check(css->id, css_refcnt(css)); 5257 5258 if (cssid) 5259 return cssid->depth; 5260 return 0; 5261} 5262EXPORT_SYMBOL_GPL(css_depth); 5263 5264/** 5265 * css_is_ancestor - test "root" css is an ancestor of "child" 5266 * @child: the css to be tested. 5267 * @root: the css supporsed to be an ancestor of the child. 5268 * 5269 * Returns true if "root" is an ancestor of "child" in its hierarchy. Because 5270 * this function reads css->id, the caller must hold rcu_read_lock(). 5271 * But, considering usual usage, the csses should be valid objects after test. 5272 * Assuming that the caller will do some action to the child if this returns 5273 * returns true, the caller must take "child";s reference count. 5274 * If "child" is valid object and this returns true, "root" is valid, too. 5275 */ 5276 5277bool css_is_ancestor(struct cgroup_subsys_state *child, 5278 const struct cgroup_subsys_state *root) 5279{ 5280 struct css_id *child_id; 5281 struct css_id *root_id; 5282 5283 child_id = rcu_dereference(child->id); 5284 if (!child_id) 5285 return false; 5286 root_id = rcu_dereference(root->id); 5287 if (!root_id) 5288 return false; 5289 if (child_id->depth < root_id->depth) 5290 return false; 5291 if (child_id->stack[root_id->depth] != root_id->id) 5292 return false; 5293 return true; 5294} 5295 5296void free_css_id(struct cgroup_subsys *ss, struct cgroup_subsys_state *css) 5297{ 5298 struct css_id *id = css->id; 5299 /* When this is called before css_id initialization, id can be NULL */ 5300 if (!id) 5301 return; 5302 5303 BUG_ON(!ss->use_id); 5304 5305 rcu_assign_pointer(id->css, NULL); 5306 rcu_assign_pointer(css->id, NULL); 5307 spin_lock(&ss->id_lock); 5308 idr_remove(&ss->idr, id->id); 5309 spin_unlock(&ss->id_lock); 5310 kfree_rcu(id, rcu_head); 5311} 5312EXPORT_SYMBOL_GPL(free_css_id); 5313 5314/* 5315 * This is called by init or create(). Then, calls to this function are 5316 * always serialized (By cgroup_mutex() at create()). 5317 */ 5318 5319static struct css_id *get_new_cssid(struct cgroup_subsys *ss, int depth) 5320{ 5321 struct css_id *newid; 5322 int ret, size; 5323 5324 BUG_ON(!ss->use_id); 5325 5326 size = sizeof(*newid) + sizeof(unsigned short) * (depth + 1); 5327 newid = kzalloc(size, GFP_KERNEL); 5328 if (!newid) 5329 return ERR_PTR(-ENOMEM); 5330 5331 idr_preload(GFP_KERNEL); 5332 spin_lock(&ss->id_lock); 5333 /* Don't use 0. allocates an ID of 1-65535 */ 5334 ret = idr_alloc(&ss->idr, newid, 1, CSS_ID_MAX + 1, GFP_NOWAIT); 5335 spin_unlock(&ss->id_lock); 5336 idr_preload_end(); 5337 5338 /* Returns error when there are no free spaces for new ID.*/ 5339 if (ret < 0) 5340 goto err_out; 5341 5342 newid->id = ret; 5343 newid->depth = depth; 5344 return newid; 5345err_out: 5346 kfree(newid); 5347 return ERR_PTR(ret); 5348 5349} 5350 5351static int __init_or_module cgroup_init_idr(struct cgroup_subsys *ss, 5352 struct cgroup_subsys_state *rootcss) 5353{ 5354 struct css_id *newid; 5355 5356 spin_lock_init(&ss->id_lock); 5357 idr_init(&ss->idr); 5358 5359 newid = get_new_cssid(ss, 0); 5360 if (IS_ERR(newid)) 5361 return PTR_ERR(newid); 5362 5363 newid->stack[0] = newid->id; 5364 newid->css = rootcss; 5365 rootcss->id = newid; 5366 return 0; 5367} 5368 5369static int alloc_css_id(struct cgroup_subsys *ss, struct cgroup *parent, 5370 struct cgroup *child) 5371{ 5372 int subsys_id, i, depth = 0; 5373 struct cgroup_subsys_state *parent_css, *child_css; 5374 struct css_id *child_id, *parent_id; 5375 5376 subsys_id = ss->subsys_id; 5377 parent_css = parent->subsys[subsys_id]; 5378 child_css = child->subsys[subsys_id]; 5379 parent_id = parent_css->id; 5380 depth = parent_id->depth + 1; 5381 5382 child_id = get_new_cssid(ss, depth); 5383 if (IS_ERR(child_id)) 5384 return PTR_ERR(child_id); 5385 5386 for (i = 0; i < depth; i++) 5387 child_id->stack[i] = parent_id->stack[i]; 5388 child_id->stack[depth] = child_id->id; 5389 /* 5390 * child_id->css pointer will be set after this cgroup is available 5391 * see cgroup_populate_dir() 5392 */ 5393 rcu_assign_pointer(child_css->id, child_id); 5394 5395 return 0; 5396} 5397 5398/** 5399 * css_lookup - lookup css by id 5400 * @ss: cgroup subsys to be looked into. 5401 * @id: the id 5402 * 5403 * Returns pointer to cgroup_subsys_state if there is valid one with id. 5404 * NULL if not. Should be called under rcu_read_lock() 5405 */ 5406struct cgroup_subsys_state *css_lookup(struct cgroup_subsys *ss, int id) 5407{ 5408 struct css_id *cssid = NULL; 5409 5410 BUG_ON(!ss->use_id); 5411 cssid = idr_find(&ss->idr, id); 5412 5413 if (unlikely(!cssid)) 5414 return NULL; 5415 5416 return rcu_dereference(cssid->css); 5417} 5418EXPORT_SYMBOL_GPL(css_lookup); 5419 5420/** 5421 * css_get_next - lookup next cgroup under specified hierarchy. 5422 * @ss: pointer to subsystem 5423 * @id: current position of iteration. 5424 * @root: pointer to css. search tree under this. 5425 * @foundid: position of found object. 5426 * 5427 * Search next css under the specified hierarchy of rootid. Calling under 5428 * rcu_read_lock() is necessary. Returns NULL if it reaches the end. 5429 */ 5430struct cgroup_subsys_state * 5431css_get_next(struct cgroup_subsys *ss, int id, 5432 struct cgroup_subsys_state *root, int *foundid) 5433{ 5434 struct cgroup_subsys_state *ret = NULL; 5435 struct css_id *tmp; 5436 int tmpid; 5437 int rootid = css_id(root); 5438 int depth = css_depth(root); 5439 5440 if (!rootid) 5441 return NULL; 5442 5443 BUG_ON(!ss->use_id); 5444 WARN_ON_ONCE(!rcu_read_lock_held()); 5445 5446 /* fill start point for scan */ 5447 tmpid = id; 5448 while (1) { 5449 /* 5450 * scan next entry from bitmap(tree), tmpid is updated after 5451 * idr_get_next(). 5452 */ 5453 tmp = idr_get_next(&ss->idr, &tmpid); 5454 if (!tmp) 5455 break; 5456 if (tmp->depth >= depth && tmp->stack[depth] == rootid) { 5457 ret = rcu_dereference(tmp->css); 5458 if (ret) { 5459 *foundid = tmpid; 5460 break; 5461 } 5462 } 5463 /* continue to scan from next id */ 5464 tmpid = tmpid + 1; 5465 } 5466 return ret; 5467} 5468 5469/* 5470 * get corresponding css from file open on cgroupfs directory 5471 */ 5472struct cgroup_subsys_state *cgroup_css_from_dir(struct file *f, int id) 5473{ 5474 struct cgroup *cgrp; 5475 struct inode *inode; 5476 struct cgroup_subsys_state *css; 5477 5478 inode = file_inode(f); 5479 /* check in cgroup filesystem dir */ 5480 if (inode->i_op != &cgroup_dir_inode_operations) 5481 return ERR_PTR(-EBADF); 5482 5483 if (id < 0 || id >= CGROUP_SUBSYS_COUNT) 5484 return ERR_PTR(-EINVAL); 5485 5486 /* get cgroup */ 5487 cgrp = __d_cgrp(f->f_dentry); 5488 css = cgrp->subsys[id]; 5489 return css ? css : ERR_PTR(-ENOENT); 5490} 5491 5492#ifdef CONFIG_CGROUP_DEBUG 5493static struct cgroup_subsys_state *debug_css_alloc(struct cgroup *cont) 5494{ 5495 struct cgroup_subsys_state *css = kzalloc(sizeof(*css), GFP_KERNEL); 5496 5497 if (!css) 5498 return ERR_PTR(-ENOMEM); 5499 5500 return css; 5501} 5502 5503static void debug_css_free(struct cgroup *cont) 5504{ 5505 kfree(cont->subsys[debug_subsys_id]); 5506} 5507 5508static u64 cgroup_refcount_read(struct cgroup *cont, struct cftype *cft) 5509{ 5510 return atomic_read(&cont->count); 5511} 5512 5513static u64 debug_taskcount_read(struct cgroup *cont, struct cftype *cft) 5514{ 5515 return cgroup_task_count(cont); 5516} 5517 5518static u64 current_css_set_read(struct cgroup *cont, struct cftype *cft) 5519{ 5520 return (u64)(unsigned long)current->cgroups; 5521} 5522 5523static u64 current_css_set_refcount_read(struct cgroup *cont, 5524 struct cftype *cft) 5525{ 5526 u64 count; 5527 5528 rcu_read_lock(); 5529 count = atomic_read(¤t->cgroups->refcount); 5530 rcu_read_unlock(); 5531 return count; 5532} 5533 5534static int current_css_set_cg_links_read(struct cgroup *cont, 5535 struct cftype *cft, 5536 struct seq_file *seq) 5537{ 5538 struct cg_cgroup_link *link; 5539 struct css_set *cg; 5540 5541 read_lock(&css_set_lock); 5542 rcu_read_lock(); 5543 cg = rcu_dereference(current->cgroups); 5544 list_for_each_entry(link, &cg->cg_links, cg_link_list) { 5545 struct cgroup *c = link->cgrp; 5546 const char *name; 5547 5548 if (c->dentry) 5549 name = c->dentry->d_name.name; 5550 else 5551 name = "?"; 5552 seq_printf(seq, "Root %d group %s\n", 5553 c->root->hierarchy_id, name); 5554 } 5555 rcu_read_unlock(); 5556 read_unlock(&css_set_lock); 5557 return 0; 5558} 5559 5560#define MAX_TASKS_SHOWN_PER_CSS 25 5561static int cgroup_css_links_read(struct cgroup *cont, 5562 struct cftype *cft, 5563 struct seq_file *seq) 5564{ 5565 struct cg_cgroup_link *link; 5566 5567 read_lock(&css_set_lock); 5568 list_for_each_entry(link, &cont->css_sets, cgrp_link_list) { 5569 struct css_set *cg = link->cg; 5570 struct task_struct *task; 5571 int count = 0; 5572 seq_printf(seq, "css_set %p\n", cg); 5573 list_for_each_entry(task, &cg->tasks, cg_list) { 5574 if (count++ > MAX_TASKS_SHOWN_PER_CSS) { 5575 seq_puts(seq, " ...\n"); 5576 break; 5577 } else { 5578 seq_printf(seq, " task %d\n", 5579 task_pid_vnr(task)); 5580 } 5581 } 5582 } 5583 read_unlock(&css_set_lock); 5584 return 0; 5585} 5586 5587static u64 releasable_read(struct cgroup *cgrp, struct cftype *cft) 5588{ 5589 return test_bit(CGRP_RELEASABLE, &cgrp->flags); 5590} 5591 5592static struct cftype debug_files[] = { 5593 { 5594 .name = "cgroup_refcount", 5595 .read_u64 = cgroup_refcount_read, 5596 }, 5597 { 5598 .name = "taskcount", 5599 .read_u64 = debug_taskcount_read, 5600 }, 5601 5602 { 5603 .name = "current_css_set", 5604 .read_u64 = current_css_set_read, 5605 }, 5606 5607 { 5608 .name = "current_css_set_refcount", 5609 .read_u64 = current_css_set_refcount_read, 5610 }, 5611 5612 { 5613 .name = "current_css_set_cg_links", 5614 .read_seq_string = current_css_set_cg_links_read, 5615 }, 5616 5617 { 5618 .name = "cgroup_css_links", 5619 .read_seq_string = cgroup_css_links_read, 5620 }, 5621 5622 { 5623 .name = "releasable", 5624 .read_u64 = releasable_read, 5625 }, 5626 5627 { } /* terminate */ 5628}; 5629 5630struct cgroup_subsys debug_subsys = { 5631 .name = "debug", 5632 .css_alloc = debug_css_alloc, 5633 .css_free = debug_css_free, 5634 .subsys_id = debug_subsys_id, 5635 .base_cftypes = debug_files, 5636}; 5637#endif /* CONFIG_CGROUP_DEBUG */ 5638 The original LXR software by the LXR community, this experimental version by lxr@linux.no. lxr.linux.no kindly hosted by Redpill Linpro AS, provider of Linux consulting and operations services since 1995.
J'appelle ça le phénomène du dindon devant la centrale nucléaire...
Phénomène qui a été décrit dans matrix. Derrière toute chose aujourd'hui il y a du code source, et le comprenez vous ou pas ? Savez vous ce que ça veut dire ? pouvez vous imaginer ce que l'on peut faire avec ?
Je reviens de SF et à cette occasion on a rencontré des anciens d'EPITECH dont un qui a monté une boite dont il a assuré le développement comme suit : viralité Facebook en utilisant leurs API (les fonctions fournis par Facebook pour permettre l'intégration d'applications non Facebook, dans Facebook).
EPITECH, grande école de commerce devant l'éternelle (c'est du 2nd degrès je précise) a donc enseigné la tehcnique du Marketing virale a ce jeune homme (c'est toujours du 2nd degrès).
En quelques 6 mois cette application a rencontré un succès fou. Si on demande à un marketeux c'est quoi un succès fou... disons que quelques dizaine de millions d'utilisateurs... c'est pas mal. La c'est fou ! Ils ont pris plus de 120 millions d'utilisateurs !
Le temps de faire la spin-off de leur boite, et de terminer le marketing virale en 9 mois donc.... ils ont été racheté 60 millions de $ ! O my god !!!
L'aspect le pus intéressant de cette histoire c'est que personne a part un informaticien, ne peut penser qu'en utilisant (en comprenant, en pliant...) l'API Facebook, et a condition d'avoir un vrai soft pas trop pourris, on peut, à 7 monter une opération marketing de malade. Je ne pense pas qu'il soit possible pour une personne qui n'a fait "que marketing" de monter une telle opération en 2013, sauf si... elle sait coder.
La situation est d'une simplicité sans nom, soit vous ne voulez/pouvez pas apprendre le code et le monde de demain ressemblera à ça :
Ou alors vous verrez ça