Merge branch 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup
Pull cgroup updates from Tejun Heo:

 - Christian extended clone3 so that processes can be spawned into
   cgroups directly. This is not only neat in terms of semantics but
   also avoids grabbing the global cgroup_threadgroup_rwsem for
   migration.

 - Daniel added !root xattr support to cgroupfs. Userland already uses
   xattrs on cgroupfs for bookkeeping. This will allow delegated cgroups
   to support such usages.

 - Prateek tried to make cpuset hotplug handling synchronous but that
   led to possible deadlock scenarios. Reverted.

 - Other minor changes including release_agent_path handling cleanup.

* 'for-5.7' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup:
  docs: cgroup-v1: Document the cpuset_v2_mode mount option
  Revert "cpuset: Make cpuset hotplug synchronous"
  cgroupfs: Support user xattrs
  kernfs: Add option to enable user xattrs
  kernfs: Add removed_size out param for simple_xattr_set
  kernfs: kvmalloc xattr value instead of kmalloc
  cgroup: Restructure release_agent_path handling
  selftests/cgroup: add tests for cloning into cgroups
  clone3: allow spawning processes into cgroups
  cgroup: add cgroup_may_write() helper
  cgroup: refactor fork helpers
  cgroup: add cgroup_get_from_file() helper
  cgroup: unify attach permission checking
  cpuset: Make cpuset hotplug synchronous
  cgroup.c: Use built-in RCU list checking
  kselftest/cgroup: add cgroup destruction test
  cgroup: Clean up css_set task traversal
This commit is contained in: d883600523

21 changed files with 794 additions and 147 deletions
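As a quick illustration of what the clone3() extension in this pull enables, here is a minimal userspace sketch (not part of the commit; assumes a 5.7+ kernel with matching UAPI headers, and an existing writable cgroup2 directory at the assumed path /sys/fs/cgroup/mytree). glibc provides no clone3() wrapper, so the raw syscall is used:

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/sched.h>        /* struct clone_args (recent headers) */
#include <linux/types.h>
#include <signal.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>

#ifndef CLONE_INTO_CGROUP
#define CLONE_INTO_CGROUP 0x200000000ULL
#endif

int main(void)
{
        /* O_PATH directory fd of the target cgroup (path is an assumption) */
        int cgfd = open("/sys/fs/cgroup/mytree",
                        O_DIRECTORY | O_PATH | O_CLOEXEC);
        if (cgfd < 0) {
                perror("open");
                return 1;
        }

        struct clone_args args = {
                .flags       = CLONE_INTO_CGROUP,
                .exit_signal = SIGCHLD,
                .cgroup      = (__u64)cgfd,
        };

        pid_t pid = syscall(__NR_clone3, &args, sizeof(args));
        if (pid < 0) {  /* ENOSYS or E2BIG: kernel lacks clone3/this flag */
                perror("clone3");
                return 1;
        }
        if (pid == 0) { /* child begins life as a member of the target cgroup */
                execlp("cat", "cat", "/proc/self/cgroup", (char *)NULL);
                _exit(127);
        }
        close(cgfd);
        waitpid(pid, NULL, 0);
        return 0;
}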
Documentation/admin-guide/cgroup-v1/cpusets.rst:

@@ -223,6 +223,17 @@ cpu_online_mask using a CPU hotplug notifier, and the mems file
 automatically tracks the value of node_states[N_MEMORY]--i.e.,
 nodes with memory--using the cpuset_track_online_nodes() hook.
 
+The cpuset.effective_cpus and cpuset.effective_mems files are
+normally read-only copies of cpuset.cpus and cpuset.mems files
+respectively. If the cpuset cgroup filesystem is mounted with the
+special "cpuset_v2_mode" option, the behavior of these files will become
+similar to the corresponding files in cpuset v2. In other words, hotplug
+events will not change cpuset.cpus and cpuset.mems. Those events will
+only affect cpuset.effective_cpus and cpuset.effective_mems which show
+the actual cpus and memory nodes that are currently used by this cpuset.
+See Documentation/admin-guide/cgroup-v2.rst for more information about
+cpuset v2 behavior.
+
 
 1.4 What are exclusive cpusets ?
 --------------------------------
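For reference, the option documented above is applied when mounting the v1 cpuset hierarchy. A minimal sketch using mount(2), assuming root privileges and an existing empty directory /mnt/cpuset (both assumptions, not from the commit); it is equivalent to "mount -t cgroup -o cpuset,cpuset_v2_mode none /mnt/cpuset":

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* filesystem type "cgroup" selects v1; options go in the data arg */
        if (mount("none", "/mnt/cpuset", "cgroup", 0,
                  "cpuset,cpuset_v2_mode") < 0) {
                perror("mount");
                return 1;
        }
        return 0;
}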
fs/kernfs/inode.c:

@@ -53,6 +53,8 @@ static struct kernfs_iattrs *__kernfs_iattrs(struct kernfs_node *kn, int alloc)
 	kn->iattr->ia_ctime = kn->iattr->ia_atime;
 
 	simple_xattrs_init(&kn->iattr->xattrs);
+	atomic_set(&kn->iattr->nr_user_xattrs, 0);
+	atomic_set(&kn->iattr->user_xattr_size, 0);
 out_unlock:
 	ret = kn->iattr;
 	mutex_unlock(&iattr_mutex);

@@ -303,7 +305,7 @@ int kernfs_xattr_set(struct kernfs_node *kn, const char *name,
 	if (!attrs)
 		return -ENOMEM;
 
-	return simple_xattr_set(&attrs->xattrs, name, value, size, flags);
+	return simple_xattr_set(&attrs->xattrs, name, value, size, flags, NULL);
 }
 
 static int kernfs_vfs_xattr_get(const struct xattr_handler *handler,

@@ -327,6 +329,86 @@ static int kernfs_vfs_xattr_set(const struct xattr_handler *handler,
 	return kernfs_xattr_set(kn, name, value, size, flags);
 }
 
+static int kernfs_vfs_user_xattr_add(struct kernfs_node *kn,
+				     const char *full_name,
+				     struct simple_xattrs *xattrs,
+				     const void *value, size_t size, int flags)
+{
+	atomic_t *sz = &kn->iattr->user_xattr_size;
+	atomic_t *nr = &kn->iattr->nr_user_xattrs;
+	ssize_t removed_size;
+	int ret;
+
+	if (atomic_inc_return(nr) > KERNFS_MAX_USER_XATTRS) {
+		ret = -ENOSPC;
+		goto dec_count_out;
+	}
+
+	if (atomic_add_return(size, sz) > KERNFS_USER_XATTR_SIZE_LIMIT) {
+		ret = -ENOSPC;
+		goto dec_size_out;
+	}
+
+	ret = simple_xattr_set(xattrs, full_name, value, size, flags,
+			       &removed_size);
+
+	if (!ret && removed_size >= 0)
+		size = removed_size;
+	else if (!ret)
+		return 0;
+dec_size_out:
+	atomic_sub(size, sz);
+dec_count_out:
+	atomic_dec(nr);
+	return ret;
+}
+
+static int kernfs_vfs_user_xattr_rm(struct kernfs_node *kn,
+				    const char *full_name,
+				    struct simple_xattrs *xattrs,
+				    const void *value, size_t size, int flags)
+{
+	atomic_t *sz = &kn->iattr->user_xattr_size;
+	atomic_t *nr = &kn->iattr->nr_user_xattrs;
+	ssize_t removed_size;
+	int ret;
+
+	ret = simple_xattr_set(xattrs, full_name, value, size, flags,
+			       &removed_size);
+
+	if (removed_size >= 0) {
+		atomic_sub(removed_size, sz);
+		atomic_dec(nr);
+	}
+
+	return ret;
+}
+
+static int kernfs_vfs_user_xattr_set(const struct xattr_handler *handler,
+				     struct dentry *unused, struct inode *inode,
+				     const char *suffix, const void *value,
+				     size_t size, int flags)
+{
+	const char *full_name = xattr_full_name(handler, suffix);
+	struct kernfs_node *kn = inode->i_private;
+	struct kernfs_iattrs *attrs;
+
+	if (!(kernfs_root(kn)->flags & KERNFS_ROOT_SUPPORT_USER_XATTR))
+		return -EOPNOTSUPP;
+
+	attrs = kernfs_iattrs(kn);
+	if (!attrs)
+		return -ENOMEM;
+
+	if (value)
+		return kernfs_vfs_user_xattr_add(kn, full_name, &attrs->xattrs,
+						 value, size, flags);
+	else
+		return kernfs_vfs_user_xattr_rm(kn, full_name, &attrs->xattrs,
+						value, size, flags);
+
+}
+
 static const struct xattr_handler kernfs_trusted_xattr_handler = {
 	.prefix = XATTR_TRUSTED_PREFIX,
 	.get = kernfs_vfs_xattr_get,

@@ -339,8 +421,15 @@ static const struct xattr_handler kernfs_security_xattr_handler = {
 	.set = kernfs_vfs_xattr_set,
 };
 
+static const struct xattr_handler kernfs_user_xattr_handler = {
+	.prefix = XATTR_USER_PREFIX,
+	.get = kernfs_vfs_xattr_get,
+	.set = kernfs_vfs_user_xattr_set,
+};
+
 const struct xattr_handler *kernfs_xattr_handlers[] = {
 	&kernfs_trusted_xattr_handler,
 	&kernfs_security_xattr_handler,
+	&kernfs_user_xattr_handler,
 	NULL
 };
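The handler added above accepts plain user.* attributes, bounded per node by KERNFS_MAX_USER_XATTRS and KERNFS_USER_XATTR_SIZE_LIMIT (see the include/linux/kernfs.h hunk below). A userspace sketch of the bookkeeping use case the commit message mentions, assuming a 5.7+ kernel and a delegated cgroup at the assumed path /sys/fs/cgroup/mytree:

#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(void)
{
        const char *cg = "/sys/fs/cgroup/mytree";   /* assumed path */
        const char *val = "managed-by=myagent";     /* arbitrary payload */
        char buf[128];
        ssize_t n;

        if (setxattr(cg, "user.note", val, strlen(val), 0) < 0) {
                perror("setxattr");   /* EOPNOTSUPP on pre-5.7 kernels */
                return 1;
        }

        n = getxattr(cg, "user.note", buf, sizeof(buf) - 1);
        if (n < 0) {
                perror("getxattr");
                return 1;
        }
        buf[n] = '\0';
        printf("user.note = %s\n", buf);
        return 0;
}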
fs/kernfs/kernfs-internal.h:

@@ -26,6 +26,8 @@ struct kernfs_iattrs {
 	struct timespec64	ia_ctime;
 
 	struct simple_xattrs	xattrs;
+	atomic_t		nr_user_xattrs;
+	atomic_t		user_xattr_size;
 };
 
 /* +1 to avoid triggering overflow warning when negating it */
fs/xattr.c (17 lines changed):

@@ -817,7 +817,7 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size)
 	if (len < sizeof(*new_xattr))
 		return NULL;
 
-	new_xattr = kmalloc(len, GFP_KERNEL);
+	new_xattr = kvmalloc(len, GFP_KERNEL);
 	if (!new_xattr)
 		return NULL;
 

@@ -860,6 +860,7 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
  * @value: value of the xattr. If %NULL, will remove the attribute.
  * @size: size of the new xattr
  * @flags: %XATTR_{CREATE|REPLACE}
+ * @removed_size: returns size of the removed xattr, -1 if none removed
  *
  * %XATTR_CREATE is set, the xattr shouldn't exist already; otherwise fails
  * with -EEXIST. If %XATTR_REPLACE is set, the xattr should exist;

@@ -868,7 +869,8 @@ int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
  * Returns 0 on success, -errno on failure.
  */
 int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-		     const void *value, size_t size, int flags)
+		     const void *value, size_t size, int flags,
+		     ssize_t *removed_size)
 {
 	struct simple_xattr *xattr;
 	struct simple_xattr *new_xattr = NULL;

@@ -882,7 +884,7 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
 
 		new_xattr->name = kstrdup(name, GFP_KERNEL);
 		if (!new_xattr->name) {
-			kfree(new_xattr);
+			kvfree(new_xattr);
 			return -ENOMEM;
 		}
 	}

@@ -895,8 +897,12 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
 			err = -EEXIST;
 		} else if (new_xattr) {
 			list_replace(&xattr->list, &new_xattr->list);
+			if (removed_size)
+				*removed_size = xattr->size;
 		} else {
 			list_del(&xattr->list);
+			if (removed_size)
+				*removed_size = xattr->size;
 		}
 		goto out;
 	}

@@ -908,11 +914,14 @@ int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
 		list_add(&new_xattr->list, &xattrs->head);
 		xattr = NULL;
 	}
+
+	if (removed_size)
+		*removed_size = -1;
 out:
 	spin_unlock(&xattrs->lock);
 	if (xattr) {
 		kfree(xattr->name);
-		kfree(xattr);
+		kvfree(xattr);
 	}
 	return err;
include/linux/cgroup-defs.h:

@@ -633,8 +633,9 @@ struct cgroup_subsys {
 	void (*cancel_attach)(struct cgroup_taskset *tset);
 	void (*attach)(struct cgroup_taskset *tset);
 	void (*post_attach)(void);
-	int (*can_fork)(struct task_struct *task);
-	void (*cancel_fork)(struct task_struct *task);
+	int (*can_fork)(struct task_struct *task,
+			struct css_set *cset);
+	void (*cancel_fork)(struct task_struct *task, struct css_set *cset);
 	void (*fork)(struct task_struct *task);
 	void (*exit)(struct task_struct *task);
 	void (*release)(struct task_struct *task);
include/linux/cgroup.h:

@@ -27,6 +27,8 @@
 
 #include <linux/cgroup-defs.h>
 
+struct kernel_clone_args;
+
 #ifdef CONFIG_CGROUPS
 
 /*

@@ -58,9 +60,6 @@ struct css_task_iter {
 	struct list_head		*tcset_head;
 
 	struct list_head		*task_pos;
-	struct list_head		*tasks_head;
-	struct list_head		*mg_tasks_head;
-	struct list_head		*dying_tasks_head;
 
 	struct list_head		*cur_tasks_head;
 	struct css_set			*cur_cset;

@@ -122,9 +121,12 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 		     struct pid *pid, struct task_struct *tsk);
 
 void cgroup_fork(struct task_struct *p);
-extern int cgroup_can_fork(struct task_struct *p);
-extern void cgroup_cancel_fork(struct task_struct *p);
-extern void cgroup_post_fork(struct task_struct *p);
+extern int cgroup_can_fork(struct task_struct *p,
+			   struct kernel_clone_args *kargs);
+extern void cgroup_cancel_fork(struct task_struct *p,
+			       struct kernel_clone_args *kargs);
+extern void cgroup_post_fork(struct task_struct *p,
+			     struct kernel_clone_args *kargs);
 void cgroup_exit(struct task_struct *p);
 void cgroup_release(struct task_struct *p);
 void cgroup_free(struct task_struct *p);

@@ -708,9 +710,12 @@ static inline int cgroupstats_build(struct cgroupstats *stats,
 					    struct dentry *dentry) { return -EINVAL; }
 
 static inline void cgroup_fork(struct task_struct *p) {}
-static inline int cgroup_can_fork(struct task_struct *p) { return 0; }
-static inline void cgroup_cancel_fork(struct task_struct *p) {}
-static inline void cgroup_post_fork(struct task_struct *p) {}
+static inline int cgroup_can_fork(struct task_struct *p,
+				  struct kernel_clone_args *kargs) { return 0; }
+static inline void cgroup_cancel_fork(struct task_struct *p,
+				      struct kernel_clone_args *kargs) {}
+static inline void cgroup_post_fork(struct task_struct *p,
+				    struct kernel_clone_args *kargs) {}
 static inline void cgroup_exit(struct task_struct *p) {}
 static inline void cgroup_release(struct task_struct *p) {}
 static inline void cgroup_free(struct task_struct *p) {}
include/linux/kernfs.h:

@@ -37,8 +37,10 @@ enum kernfs_node_type {
 	KERNFS_LINK	= 0x0004,
 };
 
-#define KERNFS_TYPE_MASK	0x000f
-#define KERNFS_FLAG_MASK	~KERNFS_TYPE_MASK
+#define KERNFS_TYPE_MASK		0x000f
+#define KERNFS_FLAG_MASK		~KERNFS_TYPE_MASK
+#define KERNFS_MAX_USER_XATTRS		128
+#define KERNFS_USER_XATTR_SIZE_LIMIT	(128 << 10)
 
 enum kernfs_node_flag {
 	KERNFS_ACTIVATED	= 0x0010,

@@ -78,6 +80,11 @@ enum kernfs_root_flag {
 	 * fhandle to access nodes of the fs.
 	 */
 	KERNFS_ROOT_SUPPORT_EXPORTOP		= 0x0004,
+
+	/*
+	 * Support user xattrs to be written to nodes rooted at this root.
+	 */
+	KERNFS_ROOT_SUPPORT_USER_XATTR		= 0x0008,
 };
 
 /* type-specific structures for kernfs_node union members */
include/linux/sched/task.h:

@@ -13,6 +13,7 @@
 struct task_struct;
 struct rusage;
 union thread_union;
+struct css_set;
 
 /* All the bits taken by the old clone syscall. */
 #define CLONE_LEGACY_FLAGS 0xffffffffULL

@@ -29,6 +30,9 @@ struct kernel_clone_args {
 	pid_t *set_tid;
 	/* Number of elements in *set_tid */
 	size_t set_tid_size;
+	int cgroup;
+	struct cgroup *cgrp;
+	struct css_set *cset;
 };
 
 /*
include/linux/xattr.h:

@@ -102,7 +102,8 @@ struct simple_xattr *simple_xattr_alloc(const void *value, size_t size);
 int simple_xattr_get(struct simple_xattrs *xattrs, const char *name,
 		     void *buffer, size_t size);
 int simple_xattr_set(struct simple_xattrs *xattrs, const char *name,
-		     const void *value, size_t size, int flags);
+		     const void *value, size_t size, int flags,
+		     ssize_t *removed_size);
 ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, char *buffer,
 			  size_t size);
 void simple_xattr_list_add(struct simple_xattrs *xattrs,
include/uapi/linux/sched.h:

@@ -35,6 +35,7 @@
 
 /* Flags for the clone3() syscall. */
 #define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
 
 /*
  * cloning flags intersect with CSIGNAL so can be used with unshare and clone3

@@ -81,6 +82,8 @@
  * @set_tid_size: This defines the size of the array referenced
  *                in @set_tid. This cannot be larger than the
  *                kernel's limit of nested PID namespaces.
+ * @cgroup:       If CLONE_INTO_CGROUP is specified set this to
+ *                a file descriptor for the cgroup.
  *
  * The structure is versioned by size and thus extensible.
  * New struct members must go at the end of the struct and

@@ -97,11 +100,13 @@ struct clone_args {
 	__aligned_u64 tls;
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
+	__aligned_u64 cgroup;
 };
 #endif
 
 #define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
 #define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
+#define CLONE_ARGS_SIZE_VER2 88 /* sizeof third published struct */
 
 /*
  * Scheduling policies
kernel/cgroup/cgroup-v1.c:

@@ -38,10 +38,7 @@ static bool cgroup_no_v1_named;
  */
 static struct workqueue_struct *cgroup_pidlist_destroy_wq;
 
-/*
- * Protects cgroup_subsys->release_agent_path. Modifying it also requires
- * cgroup_mutex. Reading requires either cgroup_mutex or this spinlock.
- */
+/* protects cgroup_subsys->release_agent_path */
 static DEFINE_SPINLOCK(release_agent_path_lock);
 
 bool cgroup1_ssid_disabled(int ssid)

@@ -775,22 +772,29 @@ void cgroup1_release_agent(struct work_struct *work)
 {
 	struct cgroup *cgrp =
 		container_of(work, struct cgroup, release_agent_work);
-	char *pathbuf = NULL, *agentbuf = NULL;
+	char *pathbuf, *agentbuf;
 	char *argv[3], *envp[3];
 	int ret;
 
-	mutex_lock(&cgroup_mutex);
+	/* snoop agent path and exit early if empty */
+	if (!cgrp->root->release_agent_path[0])
+		return;
 
+	/* prepare argument buffers */
 	pathbuf = kmalloc(PATH_MAX, GFP_KERNEL);
-	agentbuf = kstrdup(cgrp->root->release_agent_path, GFP_KERNEL);
-	if (!pathbuf || !agentbuf || !strlen(agentbuf))
-		goto out;
+	agentbuf = kmalloc(PATH_MAX, GFP_KERNEL);
+	if (!pathbuf || !agentbuf)
+		goto out_free;
 
-	spin_lock_irq(&css_set_lock);
-	ret = cgroup_path_ns_locked(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
-	spin_unlock_irq(&css_set_lock);
+	spin_lock(&release_agent_path_lock);
+	strlcpy(agentbuf, cgrp->root->release_agent_path, PATH_MAX);
+	spin_unlock(&release_agent_path_lock);
+	if (!agentbuf[0])
+		goto out_free;
+
+	ret = cgroup_path_ns(cgrp, pathbuf, PATH_MAX, &init_cgroup_ns);
 	if (ret < 0 || ret >= PATH_MAX)
-		goto out;
+		goto out_free;
 
 	argv[0] = agentbuf;
 	argv[1] = pathbuf;

@@ -801,11 +805,7 @@ void cgroup1_release_agent(struct work_struct *work)
 	envp[1] = "PATH=/sbin:/bin:/usr/sbin:/usr/bin";
 	envp[2] = NULL;
 
-	mutex_unlock(&cgroup_mutex);
 	call_usermodehelper(argv[0], argv, envp, UMH_WAIT_EXEC);
-	goto out_free;
-out:
-	mutex_unlock(&cgroup_mutex);
 out_free:
 	kfree(agentbuf);
 	kfree(pathbuf);
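For context on what is being restructured: cgroup1_release_agent() spawns a userspace helper when a v1 cgroup with notify_on_release enabled becomes empty. A sketch of the userspace side, with an assumed v1 mount at /mnt/cgroup and an assumed helper binary path (neither is from the commit):

#include <stdio.h>

static int write_str(const char *path, const char *s)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        if (fputs(s, f) == EOF) {
                fclose(f);
                return -1;
        }
        return fclose(f);
}

int main(void)
{
        /* hierarchy-wide: the binary to spawn when a cgroup empties */
        if (write_str("/mnt/cgroup/release_agent", "/usr/local/bin/cg-reaper"))
                perror("release_agent");
        /* per-cgroup: opt this cgroup in to the notification */
        if (write_str("/mnt/cgroup/jobs/notify_on_release", "1"))
                perror("notify_on_release");
        return 0;
}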
kernel/cgroup/cgroup.c:

@@ -1966,7 +1966,8 @@ int cgroup_setup_root(struct cgroup_root *root, u16 ss_mask)
 
 	root->kf_root = kernfs_create_root(kf_sops,
 					   KERNFS_ROOT_CREATE_DEACTIVATED |
-					   KERNFS_ROOT_SUPPORT_EXPORTOP,
+					   KERNFS_ROOT_SUPPORT_EXPORTOP |
+					   KERNFS_ROOT_SUPPORT_USER_XATTR,
 					   root_cgrp);
 	if (IS_ERR(root->kf_root)) {
 		ret = PTR_ERR(root->kf_root);

@@ -2726,11 +2727,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
 {
 	DEFINE_CGROUP_MGCTX(mgctx);
 	struct task_struct *task;
-	int ret;
-
-	ret = cgroup_migrate_vet_dst(dst_cgrp);
-	if (ret)
-		return ret;
+	int ret = 0;
 
 	/* look up all src csets */
 	spin_lock_irq(&css_set_lock);

@@ -4160,7 +4157,8 @@ struct cgroup_subsys_state *css_next_child(struct cgroup_subsys_state *pos,
 	} else if (likely(!(pos->flags & CSS_RELEASED))) {
 		next = list_entry_rcu(pos->sibling.next, struct cgroup_subsys_state, sibling);
 	} else {
-		list_for_each_entry_rcu(next, &parent->children, sibling)
+		list_for_each_entry_rcu(next, &parent->children, sibling,
+					lockdep_is_held(&cgroup_mutex))
 			if (next->serial_nr > pos->serial_nr)
 				break;
 	}

@@ -4403,29 +4401,24 @@ static void css_task_iter_advance_css_set(struct css_task_iter *it)
 
 	lockdep_assert_held(&css_set_lock);
 
-	/* Advance to the next non-empty css_set */
-	do {
-		cset = css_task_iter_next_css_set(it);
-		if (!cset) {
-			it->task_pos = NULL;
-			return;
+	/* Advance to the next non-empty css_set and find first non-empty tasks list*/
+	while ((cset = css_task_iter_next_css_set(it))) {
+		if (!list_empty(&cset->tasks)) {
+			it->cur_tasks_head = &cset->tasks;
+			break;
+		} else if (!list_empty(&cset->mg_tasks)) {
+			it->cur_tasks_head = &cset->mg_tasks;
+			break;
+		} else if (!list_empty(&cset->dying_tasks)) {
+			it->cur_tasks_head = &cset->dying_tasks;
+			break;
 		}
-	} while (!css_set_populated(cset) && list_empty(&cset->dying_tasks));
-
-	if (!list_empty(&cset->tasks)) {
-		it->task_pos = cset->tasks.next;
-		it->cur_tasks_head = &cset->tasks;
-	} else if (!list_empty(&cset->mg_tasks)) {
-		it->task_pos = cset->mg_tasks.next;
-		it->cur_tasks_head = &cset->mg_tasks;
-	} else {
-		it->task_pos = cset->dying_tasks.next;
-		it->cur_tasks_head = &cset->dying_tasks;
 	}
-
-	it->tasks_head = &cset->tasks;
-	it->mg_tasks_head = &cset->mg_tasks;
-	it->dying_tasks_head = &cset->dying_tasks;
+	if (!cset) {
+		it->task_pos = NULL;
+		return;
+	}
+	it->task_pos = it->cur_tasks_head->next;
 
 	/*
 	 * We don't keep css_sets locked across iteration steps and thus

@@ -4470,24 +4463,24 @@ static void css_task_iter_advance(struct css_task_iter *it)
 repeat:
 	if (it->task_pos) {
 		/*
-		 * Advance iterator to find next entry. cset->tasks is
-		 * consumed first and then ->mg_tasks. After ->mg_tasks,
-		 * we move onto the next cset.
+		 * Advance iterator to find next entry. We go through cset
+		 * tasks, mg_tasks and dying_tasks, when consumed we move onto
+		 * the next cset.
 		 */
 		if (it->flags & CSS_TASK_ITER_SKIPPED)
 			it->flags &= ~CSS_TASK_ITER_SKIPPED;
 		else
 			it->task_pos = it->task_pos->next;
 
-		if (it->task_pos == it->tasks_head) {
-			it->task_pos = it->mg_tasks_head->next;
-			it->cur_tasks_head = it->mg_tasks_head;
+		if (it->task_pos == &it->cur_cset->tasks) {
+			it->cur_tasks_head = &it->cur_cset->mg_tasks;
+			it->task_pos = it->cur_tasks_head->next;
 		}
-		if (it->task_pos == it->mg_tasks_head) {
-			it->task_pos = it->dying_tasks_head->next;
-			it->cur_tasks_head = it->dying_tasks_head;
+		if (it->task_pos == &it->cur_cset->mg_tasks) {
+			it->cur_tasks_head = &it->cur_cset->dying_tasks;
+			it->task_pos = it->cur_tasks_head->next;
 		}
-		if (it->task_pos == it->dying_tasks_head)
+		if (it->task_pos == &it->cur_cset->dying_tasks)
 			css_task_iter_advance_css_set(it);
 	} else {
 		/* called from start, proceed to the first cset */

@@ -4505,12 +4498,12 @@ static void css_task_iter_advance(struct css_task_iter *it)
 			goto repeat;
 
 		/* and dying leaders w/o live member threads */
-		if (it->cur_tasks_head == it->dying_tasks_head &&
+		if (it->cur_tasks_head == &it->cur_cset->dying_tasks &&
 		    !atomic_read(&task->signal->live))
 			goto repeat;
 	} else {
 		/* skip all dying ones */
-		if (it->cur_tasks_head == it->dying_tasks_head)
+		if (it->cur_tasks_head == &it->cur_cset->dying_tasks)
 			goto repeat;
 	}
 }

@@ -4674,13 +4667,28 @@ static int cgroup_procs_show(struct seq_file *s, void *v)
 	return 0;
 }
 
+static int cgroup_may_write(const struct cgroup *cgrp, struct super_block *sb)
+{
+	int ret;
+	struct inode *inode;
+
+	lockdep_assert_held(&cgroup_mutex);
+
+	inode = kernfs_get_inode(sb, cgrp->procs_file.kn);
+	if (!inode)
+		return -ENOMEM;
+
+	ret = inode_permission(inode, MAY_WRITE);
+	iput(inode);
+	return ret;
+}
+
 static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 					 struct cgroup *dst_cgrp,
 					 struct super_block *sb)
 {
 	struct cgroup_namespace *ns = current->nsproxy->cgroup_ns;
 	struct cgroup *com_cgrp = src_cgrp;
-	struct inode *inode;
 	int ret;
 
 	lockdep_assert_held(&cgroup_mutex);

@@ -4690,12 +4698,7 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 		com_cgrp = cgroup_parent(com_cgrp);
 
 	/* %current should be authorized to migrate to the common ancestor */
-	inode = kernfs_get_inode(sb, com_cgrp->procs_file.kn);
-	if (!inode)
-		return -ENOMEM;
-
-	ret = inode_permission(inode, MAY_WRITE);
-	iput(inode);
+	ret = cgroup_may_write(com_cgrp, sb);
 	if (ret)
 		return ret;
 

@@ -4711,6 +4714,26 @@ static int cgroup_procs_write_permission(struct cgroup *src_cgrp,
 	return 0;
 }
 
+static int cgroup_attach_permissions(struct cgroup *src_cgrp,
+				     struct cgroup *dst_cgrp,
+				     struct super_block *sb, bool threadgroup)
+{
+	int ret = 0;
+
+	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp, sb);
+	if (ret)
+		return ret;
+
+	ret = cgroup_migrate_vet_dst(dst_cgrp);
+	if (ret)
+		return ret;
+
+	if (!threadgroup && (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp))
+		ret = -EOPNOTSUPP;
+
+	return ret;
+}
+
 static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
 				  char *buf, size_t nbytes, loff_t off)
 {

@@ -4733,8 +4756,8 @@ static ssize_t cgroup_procs_write(struct kernfs_open_file *of,
 	src_cgrp = task_cgroup_from_root(task, &cgrp_dfl_root);
 	spin_unlock_irq(&css_set_lock);
 
-	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-					    of->file->f_path.dentry->d_sb);
+	ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+					of->file->f_path.dentry->d_sb, true);
 	if (ret)
 		goto out_finish;
 

@@ -4778,16 +4801,11 @@ static ssize_t cgroup_threads_write(struct kernfs_open_file *of,
 	spin_unlock_irq(&css_set_lock);
 
 	/* thread migrations follow the cgroup.procs delegation rule */
-	ret = cgroup_procs_write_permission(src_cgrp, dst_cgrp,
-					    of->file->f_path.dentry->d_sb);
+	ret = cgroup_attach_permissions(src_cgrp, dst_cgrp,
+					of->file->f_path.dentry->d_sb, false);
 	if (ret)
 		goto out_finish;
 
-	/* and must be contained in the same domain */
-	ret = -EOPNOTSUPP;
-	if (src_cgrp->dom_cgrp != dst_cgrp->dom_cgrp)
-		goto out_finish;
-
 	ret = cgroup_attach_task(dst_cgrp, task, false);
 
 out_finish:

@@ -5876,8 +5894,7 @@ int proc_cgroup_show(struct seq_file *m, struct pid_namespace *ns,
 * @child: pointer to task_struct of forking parent process.
 *
 * A task is associated with the init_css_set until cgroup_post_fork()
- * attaches it to the parent's css_set. Empty cg_list indicates that
- * @child isn't holding reference to its css_set.
+ * attaches it to the target css_set.
 */
 void cgroup_fork(struct task_struct *child)
 {

@@ -5885,21 +5902,172 @@ void cgroup_fork(struct task_struct *child)
 	INIT_LIST_HEAD(&child->cg_list);
 }
 
+static struct cgroup *cgroup_get_from_file(struct file *f)
+{
+	struct cgroup_subsys_state *css;
+	struct cgroup *cgrp;
+
+	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+	if (IS_ERR(css))
+		return ERR_CAST(css);
+
+	cgrp = css->cgroup;
+	if (!cgroup_on_dfl(cgrp)) {
+		cgroup_put(cgrp);
+		return ERR_PTR(-EBADF);
+	}
+
+	return cgrp;
+}
+
+/**
+ * cgroup_css_set_fork - find or create a css_set for a child process
+ * @kargs: the arguments passed to create the child process
+ *
+ * This functions finds or creates a new css_set which the child
+ * process will be attached to in cgroup_post_fork(). By default,
+ * the child process will be given the same css_set as its parent.
+ *
+ * If CLONE_INTO_CGROUP is specified this function will try to find an
+ * existing css_set which includes the requested cgroup and if not create
+ * a new css_set that the child will be attached to later. If this function
+ * succeeds it will hold cgroup_threadgroup_rwsem on return. If
+ * CLONE_INTO_CGROUP is requested this function will grab cgroup mutex
+ * before grabbing cgroup_threadgroup_rwsem and will hold a reference
+ * to the target cgroup.
+ */
+static int cgroup_css_set_fork(struct kernel_clone_args *kargs)
+	__acquires(&cgroup_mutex) __acquires(&cgroup_threadgroup_rwsem)
+{
+	int ret;
+	struct cgroup *dst_cgrp = NULL;
+	struct css_set *cset;
+	struct super_block *sb;
+	struct file *f;
+
+	if (kargs->flags & CLONE_INTO_CGROUP)
+		mutex_lock(&cgroup_mutex);
+
+	cgroup_threadgroup_change_begin(current);
+
+	spin_lock_irq(&css_set_lock);
+	cset = task_css_set(current);
+	get_css_set(cset);
+	spin_unlock_irq(&css_set_lock);
+
+	if (!(kargs->flags & CLONE_INTO_CGROUP)) {
+		kargs->cset = cset;
+		return 0;
+	}
+
+	f = fget_raw(kargs->cgroup);
+	if (!f) {
+		ret = -EBADF;
+		goto err;
+	}
+	sb = f->f_path.dentry->d_sb;
+
+	dst_cgrp = cgroup_get_from_file(f);
+	if (IS_ERR(dst_cgrp)) {
+		ret = PTR_ERR(dst_cgrp);
+		dst_cgrp = NULL;
+		goto err;
+	}
+
+	if (cgroup_is_dead(dst_cgrp)) {
+		ret = -ENODEV;
+		goto err;
+	}
+
+	/*
+	 * Verify that we the target cgroup is writable for us. This is
+	 * usually done by the vfs layer but since we're not going through
+	 * the vfs layer here we need to do it "manually".
+	 */
+	ret = cgroup_may_write(dst_cgrp, sb);
+	if (ret)
+		goto err;
+
+	ret = cgroup_attach_permissions(cset->dfl_cgrp, dst_cgrp, sb,
+					!(kargs->flags & CLONE_THREAD));
+	if (ret)
+		goto err;
+
+	kargs->cset = find_css_set(cset, dst_cgrp);
+	if (!kargs->cset) {
+		ret = -ENOMEM;
+		goto err;
+	}
+
+	put_css_set(cset);
+	fput(f);
+	kargs->cgrp = dst_cgrp;
+	return ret;
+
+err:
+	cgroup_threadgroup_change_end(current);
+	mutex_unlock(&cgroup_mutex);
+	if (f)
+		fput(f);
+	if (dst_cgrp)
+		cgroup_put(dst_cgrp);
+	put_css_set(cset);
+	if (kargs->cset)
+		put_css_set(kargs->cset);
+	return ret;
+}
+
+/**
+ * cgroup_css_set_put_fork - drop references we took during fork
+ * @kargs: the arguments passed to create the child process
+ *
+ * Drop references to the prepared css_set and target cgroup if
+ * CLONE_INTO_CGROUP was requested.
+ */
+static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+{
+	cgroup_threadgroup_change_end(current);
+
+	if (kargs->flags & CLONE_INTO_CGROUP) {
+		struct cgroup *cgrp = kargs->cgrp;
+		struct css_set *cset = kargs->cset;
+
+		mutex_unlock(&cgroup_mutex);
+
+		if (cset) {
+			put_css_set(cset);
+			kargs->cset = NULL;
+		}
+
+		if (cgrp) {
+			cgroup_put(cgrp);
+			kargs->cgrp = NULL;
+		}
+	}
+}
+
 /**
  * cgroup_can_fork - called on a new task before the process is exposed
- * @child: the task in question.
+ * @child: the child process
  *
- * This calls the subsystem can_fork() callbacks. If the can_fork() callback
- * returns an error, the fork aborts with that error code. This allows for
- * a cgroup subsystem to conditionally allow or deny new forks.
+ * This prepares a new css_set for the child process which the child will
+ * be attached to in cgroup_post_fork().
+ * This calls the subsystem can_fork() callbacks. If the cgroup_can_fork()
+ * callback returns an error, the fork aborts with that error code. This
+ * allows for a cgroup subsystem to conditionally allow or deny new forks.
  */
-int cgroup_can_fork(struct task_struct *child)
+int cgroup_can_fork(struct task_struct *child, struct kernel_clone_args *kargs)
 {
 	struct cgroup_subsys *ss;
 	int i, j, ret;
 
+	ret = cgroup_css_set_fork(kargs);
+	if (ret)
+		return ret;
+
 	do_each_subsys_mask(ss, i, have_canfork_callback) {
-		ret = ss->can_fork(child);
+		ret = ss->can_fork(child, kargs->cset);
 		if (ret)
 			goto out_revert;
 	} while_each_subsys_mask();

@@ -5911,54 +6079,64 @@ int cgroup_can_fork(struct task_struct *child)
 		if (j >= i)
 			break;
 		if (ss->cancel_fork)
-			ss->cancel_fork(child);
+			ss->cancel_fork(child, kargs->cset);
 	}
 
+	cgroup_css_set_put_fork(kargs);
+
 	return ret;
 }
 
 /**
  * cgroup_cancel_fork - called if a fork failed after cgroup_can_fork()
- * @child: the task in question
+ * @child: the child process
+ * @kargs: the arguments passed to create the child process
  *
  * This calls the cancel_fork() callbacks if a fork failed *after*
- * cgroup_can_fork() succeded.
+ * cgroup_can_fork() succeded and cleans up references we took to
+ * prepare a new css_set for the child process in cgroup_can_fork().
  */
-void cgroup_cancel_fork(struct task_struct *child)
+void cgroup_cancel_fork(struct task_struct *child,
+			struct kernel_clone_args *kargs)
 {
 	struct cgroup_subsys *ss;
 	int i;
 
 	for_each_subsys(ss, i)
 		if (ss->cancel_fork)
-			ss->cancel_fork(child);
+			ss->cancel_fork(child, kargs->cset);
+
+	cgroup_css_set_put_fork(kargs);
 }
 
 /**
- * cgroup_post_fork - called on a new task after adding it to the task list
- * @child: the task in question
+ * cgroup_post_fork - finalize cgroup setup for the child process
+ * @child: the child process
  *
- * Adds the task to the list running through its css_set if necessary and
- * call the subsystem fork() callbacks. Has to be after the task is
- * visible on the task list in case we race with the first call to
- * cgroup_task_iter_start() - to guarantee that the new task ends up on its
- * list.
+ * Attach the child process to its css_set calling the subsystem fork()
+ * callbacks.
 */
-void cgroup_post_fork(struct task_struct *child)
+void cgroup_post_fork(struct task_struct *child,
+		      struct kernel_clone_args *kargs)
+	__releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
 {
 	struct cgroup_subsys *ss;
 	struct css_set *cset;
 	int i;
 
+	cset = kargs->cset;
+	kargs->cset = NULL;
+
 	spin_lock_irq(&css_set_lock);
 
-	WARN_ON_ONCE(!list_empty(&child->cg_list));
-	cset = task_css_set(current); /* current is @child's parent */
-	get_css_set(cset);
-	cset->nr_tasks++;
-	css_set_move_task(child, NULL, cset, false);
+	/* init tasks are special, only link regular threads */
+	if (likely(child->pid)) {
+		WARN_ON_ONCE(!list_empty(&child->cg_list));
+		cset->nr_tasks++;
+		css_set_move_task(child, NULL, cset, false);
+	} else {
+		put_css_set(cset);
+		cset = NULL;
+	}
 
 	/*

@@ -5990,6 +6168,17 @@ void cgroup_post_fork(struct task_struct *child)
 	do_each_subsys_mask(ss, i, have_fork_callback) {
 		ss->fork(child);
 	} while_each_subsys_mask();
+
+	/* Make the new cset the root_cset of the new cgroup namespace. */
+	if (kargs->flags & CLONE_NEWCGROUP) {
+		struct css_set *rcset = child->nsproxy->cgroup_ns->root_cset;
+
+		get_css_set(cset);
+		child->nsproxy->cgroup_ns->root_cset = cset;
+		put_css_set(rcset);
+	}
+
+	cgroup_css_set_put_fork(kargs);
 }
 
 /**

@@ -6176,7 +6365,6 @@ EXPORT_SYMBOL_GPL(cgroup_get_from_path);
 */
 struct cgroup *cgroup_get_from_fd(int fd)
 {
-	struct cgroup_subsys_state *css;
 	struct cgroup *cgrp;
 	struct file *f;
 

@@ -6184,17 +6372,8 @@ struct cgroup *cgroup_get_from_fd(int fd)
 	if (!f)
 		return ERR_PTR(-EBADF);
 
-	css = css_tryget_online_from_dir(f->f_path.dentry, NULL);
+	cgrp = cgroup_get_from_file(f);
 	fput(f);
-	if (IS_ERR(css))
-		return ERR_CAST(css);
-
-	cgrp = css->cgroup;
-	if (!cgroup_on_dfl(cgrp)) {
-		cgroup_put(cgrp);
-		return ERR_PTR(-EBADF);
-	}
-
 	return cgrp;
 }
 EXPORT_SYMBOL_GPL(cgroup_get_from_fd);
kernel/cgroup/cpuset.c:

@@ -358,8 +358,12 @@ static DECLARE_WORK(cpuset_hotplug_work, cpuset_hotplug_workfn);
 static DECLARE_WAIT_QUEUE_HEAD(cpuset_attach_wq);
 
 /*
- * Cgroup v2 behavior is used when on default hierarchy or the
- * cgroup_v2_mode flag is set.
+ * Cgroup v2 behavior is used on the "cpus" and "mems" control files when
+ * on default hierarchy or when the cpuset_v2_mode flag is set by mounting
+ * the v1 cpuset cgroup filesystem with the "cpuset_v2_mode" mount option.
+ * With v2 behavior, "cpus" and "mems" are always what the users have
+ * requested and won't be changed by hotplug events. Only the effective
+ * cpus or mems will be affected.
 */
 static inline bool is_in_v2_mode(void)
 {
kernel/cgroup/pids.c:

@@ -33,6 +33,7 @@
 #include <linux/atomic.h>
 #include <linux/cgroup.h>
 #include <linux/slab.h>
+#include <linux/sched/task.h>
 
 #define PIDS_MAX (PID_MAX_LIMIT + 1ULL)
 #define PIDS_MAX_STR "max"

@@ -214,13 +215,16 @@ static void pids_cancel_attach(struct cgroup_taskset *tset)
 * task_css_check(true) in pids_can_fork() and pids_cancel_fork() relies
 * on cgroup_threadgroup_change_begin() held by the copy_process().
 */
-static int pids_can_fork(struct task_struct *task)
+static int pids_can_fork(struct task_struct *task, struct css_set *cset)
 {
 	struct cgroup_subsys_state *css;
 	struct pids_cgroup *pids;
 	int err;
 
-	css = task_css_check(current, pids_cgrp_id, true);
+	if (cset)
+		css = cset->subsys[pids_cgrp_id];
+	else
+		css = task_css_check(current, pids_cgrp_id, true);
 	pids = css_pids(css);
 	err = pids_try_charge(pids, 1);
 	if (err) {

@@ -235,12 +239,15 @@ static int pids_can_fork(struct task_struct *task)
 	return err;
 }
 
-static void pids_cancel_fork(struct task_struct *task)
+static void pids_cancel_fork(struct task_struct *task, struct css_set *cset)
 {
 	struct cgroup_subsys_state *css;
 	struct pids_cgroup *pids;
 
-	css = task_css_check(current, pids_cgrp_id, true);
+	if (cset)
+		css = cset->subsys[pids_cgrp_id];
+	else
+		css = task_css_check(current, pids_cgrp_id, true);
 	pids = css_pids(css);
 	pids_uncharge(pids, 1);
 }
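The pids change above means the fork-time charge is taken against the css_set the child will actually join, not just the parent's. A userspace sketch (not from the commit) of the enforcement this hook provides: once pids.max is reached, fork() fails with EAGAIN. It assumes a cgroup2 mount at /sys/fs/cgroup with the pids controller enabled and a writable child cgroup "mytree"; both are assumptions:

#include <errno.h>
#include <stdio.h>
#include <unistd.h>

static int write_file(const char *path, const char *s)
{
        FILE *f = fopen(path, "w");

        if (!f)
                return -1;
        if (fputs(s, f) == EOF) {
                fclose(f);
                return -1;
        }
        return fclose(f);
}

int main(void)
{
        /* join the cgroup (writing "0" moves the caller), then cap it at
         * a single task: ourselves */
        if (write_file("/sys/fs/cgroup/mytree/cgroup.procs", "0") ||
            write_file("/sys/fs/cgroup/mytree/pids.max", "1")) {
                perror("setup");
                return 1;
        }

        pid_t pid = fork();
        if (pid < 0 && errno == EAGAIN)
                puts("fork denied by pids.max, as expected");
        else if (pid == 0)
                _exit(0);
        return 0;
}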
kernel/fork.c:

@@ -2176,16 +2176,15 @@ static __latent_entropy struct task_struct *copy_process(
 	INIT_LIST_HEAD(&p->thread_group);
 	p->task_works = NULL;
 
-	cgroup_threadgroup_change_begin(current);
 	/*
 	 * Ensure that the cgroup subsystem policies allow the new process to be
 	 * forked. It should be noted the the new process's css_set can be changed
 	 * between here and cgroup_post_fork() if an organisation operation is in
 	 * progress.
 	 */
-	retval = cgroup_can_fork(p);
+	retval = cgroup_can_fork(p, args);
 	if (retval)
-		goto bad_fork_cgroup_threadgroup_change_end;
+		goto bad_fork_put_pidfd;
 
 	/*
 	 * From this point on we must avoid any synchronous user-space

@@ -2290,8 +2289,7 @@ static __latent_entropy struct task_struct *copy_process(
 	write_unlock_irq(&tasklist_lock);
 
 	proc_fork_connector(p);
-	cgroup_post_fork(p);
-	cgroup_threadgroup_change_end(current);
+	cgroup_post_fork(p, args);
 	perf_event_fork(p);
 
 	trace_task_newtask(p, clone_flags);

@@ -2302,9 +2300,7 @@ static __latent_entropy struct task_struct *copy_process(
 bad_fork_cancel_cgroup:
 	spin_unlock(&current->sighand->siglock);
 	write_unlock_irq(&tasklist_lock);
-	cgroup_cancel_fork(p);
-bad_fork_cgroup_threadgroup_change_end:
-	cgroup_threadgroup_change_end(current);
+	cgroup_cancel_fork(p, args);
 bad_fork_put_pidfd:
 	if (clone_flags & CLONE_PIDFD) {
 		fput(pidfile);

@@ -2633,6 +2629,9 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 	     !valid_signal(args.exit_signal)))
 		return -EINVAL;
 
+	if ((args.flags & CLONE_INTO_CGROUP) && args.cgroup < 0)
+		return -EINVAL;
+
 	*kargs = (struct kernel_clone_args){
 		.flags		= args.flags,
 		.pidfd		= u64_to_user_ptr(args.pidfd),

@@ -2643,6 +2642,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
 		.stack_size	= args.stack_size,
 		.tls		= args.tls,
 		.set_tid_size	= args.set_tid_size,
+		.cgroup		= args.cgroup,
 	};
 
 	if (args.set_tid &&

@@ -2686,7 +2686,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
 static bool clone3_args_valid(struct kernel_clone_args *kargs)
 {
 	/* Verify that no unknown flags are passed along. */
-	if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
+	if (kargs->flags &
+	    ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND | CLONE_INTO_CGROUP))
 		return false;
 
 	/*
mm/shmem.c:

@@ -3243,7 +3243,7 @@ static int shmem_xattr_handler_set(const struct xattr_handler *handler,
 	struct shmem_inode_info *info = SHMEM_I(inode);
 
 	name = xattr_full_name(handler, name);
-	return simple_xattr_set(&info->xattrs, name, value, size, flags);
+	return simple_xattr_set(&info->xattrs, name, value, size, flags, NULL);
 }
 
 static const struct xattr_handler shmem_security_xattr_handler = {
tools/testing/selftests/cgroup/Makefile:

@@ -11,6 +11,6 @@ TEST_GEN_PROGS += test_freezer
 
 include ../lib.mk
 
-$(OUTPUT)/test_memcontrol: cgroup_util.c
-$(OUTPUT)/test_core: cgroup_util.c
-$(OUTPUT)/test_freezer: cgroup_util.c
+$(OUTPUT)/test_memcontrol: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_core: cgroup_util.c ../clone3/clone3_selftests.h
+$(OUTPUT)/test_freezer: cgroup_util.c ../clone3/clone3_selftests.h
tools/testing/selftests/cgroup/cgroup_util.c:

@@ -15,6 +15,7 @@
 #include <unistd.h>
 
 #include "cgroup_util.h"
+#include "../clone3/clone3_selftests.h"
 
 static ssize_t read_text(const char *path, char *buf, size_t max_len)
 {

@@ -331,12 +332,112 @@ int cg_run(const char *cgroup,
 	}
 }
 
+pid_t clone_into_cgroup(int cgroup_fd)
+{
+#ifdef CLONE_ARGS_SIZE_VER2
+	pid_t pid;
+
+	struct clone_args args = {
+		.flags = CLONE_INTO_CGROUP,
+		.exit_signal = SIGCHLD,
+		.cgroup = cgroup_fd,
+	};
+
+	pid = sys_clone3(&args, sizeof(struct clone_args));
+	/*
+	 * Verify that this is a genuine test failure:
+	 * ENOSYS -> clone3() not available
+	 * E2BIG  -> CLONE_INTO_CGROUP not available
+	 */
+	if (pid < 0 && (errno == ENOSYS || errno == E2BIG))
+		goto pretend_enosys;
+
+	return pid;
+
+pretend_enosys:
+#endif
+	errno = ENOSYS;
+	return -ENOSYS;
+}
+
+int clone_reap(pid_t pid, int options)
+{
+	int ret;
+	siginfo_t info = {
+		.si_signo = 0,
+	};
+
+again:
+	ret = waitid(P_PID, pid, &info, options | __WALL | __WNOTHREAD);
+	if (ret < 0) {
+		if (errno == EINTR)
+			goto again;
+		return -1;
+	}
+
+	if (options & WEXITED) {
+		if (WIFEXITED(info.si_status))
+			return WEXITSTATUS(info.si_status);
+	}
+
+	if (options & WSTOPPED) {
+		if (WIFSTOPPED(info.si_status))
+			return WSTOPSIG(info.si_status);
+	}
+
+	if (options & WCONTINUED) {
+		if (WIFCONTINUED(info.si_status))
+			return 0;
+	}
+
+	return -1;
+}
+
+int dirfd_open_opath(const char *dir)
+{
+	return open(dir, O_DIRECTORY | O_CLOEXEC | O_NOFOLLOW | O_PATH);
+}
+
+#define close_prot_errno(fd)             \
+	if (fd >= 0) {                   \
+		int _e_ = errno;         \
+		close(fd);               \
+		errno = _e_;             \
+	}
+
+static int clone_into_cgroup_run_nowait(const char *cgroup,
+					int (*fn)(const char *cgroup, void *arg),
+					void *arg)
+{
+	int cgroup_fd;
+	pid_t pid;
+
+	cgroup_fd = dirfd_open_opath(cgroup);
+	if (cgroup_fd < 0)
+		return -1;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	close_prot_errno(cgroup_fd);
+	if (pid == 0)
+		exit(fn(cgroup, arg));
+
+	return pid;
+}
+
 int cg_run_nowait(const char *cgroup,
 		  int (*fn)(const char *cgroup, void *arg),
 		  void *arg)
 {
 	int pid;
 
+	pid = clone_into_cgroup_run_nowait(cgroup, fn, arg);
+	if (pid > 0)
+		return pid;
+
+	/* Genuine test failure. */
+	if (pid < 0 && errno != ENOSYS)
+		return -1;
+
 	pid = fork();
 	if (pid == 0) {
 		char buf[64];

@@ -450,3 +551,28 @@ int proc_read_strstr(int pid, bool thread, const char *item, const char *needle)
 
 	return strstr(buf, needle) ? 0 : -1;
 }
+
+int clone_into_cgroup_run_wait(const char *cgroup)
+{
+	int cgroup_fd;
+	pid_t pid;
+
+	cgroup_fd = dirfd_open_opath(cgroup);
+	if (cgroup_fd < 0)
+		return -1;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	close_prot_errno(cgroup_fd);
+	if (pid < 0)
+		return -1;
+
+	if (pid == 0)
+		exit(EXIT_SUCCESS);
+
+	/*
+	 * We don't care whether this fails. We only care whether the initial
+	 * clone succeeded.
+	 */
+	(void)clone_reap(pid, WEXITED);
+	return 0;
+}
tools/testing/selftests/cgroup/cgroup_util.h:

@@ -50,3 +50,7 @@ extern int cg_wait_for_proc_count(const char *cgroup, int count);
 extern int cg_killall(const char *cgroup);
 extern ssize_t proc_read_text(int pid, bool thread, const char *item, char *buf, size_t size);
 extern int proc_read_strstr(int pid, bool thread, const char *item, const char *needle);
+extern pid_t clone_into_cgroup(int cgroup_fd);
+extern int clone_reap(pid_t pid, int options);
+extern int clone_into_cgroup_run_wait(const char *cgroup);
+extern int dirfd_open_opath(const char *dir);
tools/testing/selftests/cgroup/test_core.c:

@@ -2,7 +2,10 @@
 
 #include <linux/limits.h>
 #include <sys/types.h>
+#include <sys/mman.h>
+#include <sys/wait.h>
 #include <unistd.h>
+#include <fcntl.h>
 #include <stdio.h>
 #include <errno.h>
 #include <signal.h>

@@ -12,6 +15,115 @@
 #include "../kselftest.h"
 #include "cgroup_util.h"
 
+static int touch_anon(char *buf, size_t size)
+{
+	int fd;
+	char *pos = buf;
+
+	fd = open("/dev/urandom", O_RDONLY);
+	if (fd < 0)
+		return -1;
+
+	while (size > 0) {
+		ssize_t ret = read(fd, pos, size);
+
+		if (ret < 0) {
+			if (errno != EINTR) {
+				close(fd);
+				return -1;
+			}
+		} else {
+			pos += ret;
+			size -= ret;
+		}
+	}
+	close(fd);
+
+	return 0;
+}
+
+static int alloc_and_touch_anon_noexit(const char *cgroup, void *arg)
+{
+	int ppid = getppid();
+	size_t size = (size_t)arg;
+	void *buf;
+
+	buf = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON,
+		   0, 0);
+	if (buf == MAP_FAILED)
+		return -1;
+
+	if (touch_anon((char *)buf, size)) {
+		munmap(buf, size);
+		return -1;
+	}
+
+	while (getppid() == ppid)
+		sleep(1);
+
+	munmap(buf, size);
+	return 0;
+}
+
+/*
+ * Create a child process that allocates and touches 100MB, then waits to be
+ * killed. Wait until the child is attached to the cgroup, kill all processes
+ * in that cgroup and wait until "cgroup.procs" is empty. At this point try to
+ * destroy the empty cgroup. The test helps detect race conditions between
+ * dying processes leaving the cgroup and cgroup destruction path.
+ */
+static int test_cgcore_destroy(const char *root)
+{
+	int ret = KSFT_FAIL;
+	char *cg_test = NULL;
+	int child_pid;
+	char buf[PAGE_SIZE];
+
+	cg_test = cg_name(root, "cg_test");
+
+	if (!cg_test)
+		goto cleanup;
+
+	for (int i = 0; i < 10; i++) {
+		if (cg_create(cg_test))
+			goto cleanup;
+
+		child_pid = cg_run_nowait(cg_test, alloc_and_touch_anon_noexit,
+					  (void *) MB(100));
+
+		if (child_pid < 0)
+			goto cleanup;
+
+		/* wait for the child to enter cgroup */
+		if (cg_wait_for_proc_count(cg_test, 1))
+			goto cleanup;
+
+		if (cg_killall(cg_test))
+			goto cleanup;
+
+		/* wait for cgroup to be empty */
+		while (1) {
+			if (cg_read(cg_test, "cgroup.procs", buf, sizeof(buf)))
+				goto cleanup;
+			if (buf[0] == '\0')
+				break;
+			usleep(1000);
+		}
+
+		if (rmdir(cg_test))
+			goto cleanup;
+
+		if (waitpid(child_pid, NULL, 0) < 0)
+			goto cleanup;
+	}
+	ret = KSFT_PASS;
+cleanup:
+	if (cg_test)
+		cg_destroy(cg_test);
+	free(cg_test);
+	return ret;
+}
+
 /*
 * A(0) - B(0) - C(1)
 *     \ D(0)

@@ -25,8 +137,11 @@
 static int test_cgcore_populated(const char *root)
 {
 	int ret = KSFT_FAIL;
+	int err;
 	char *cg_test_a = NULL, *cg_test_b = NULL;
 	char *cg_test_c = NULL, *cg_test_d = NULL;
+	int cgroup_fd = -EBADF;
+	pid_t pid;
 
 	cg_test_a = cg_name(root, "cg_test_a");
 	cg_test_b = cg_name(root, "cg_test_a/cg_test_b");

@@ -78,6 +193,52 @@ static int test_cgcore_populated(const char *root)
 	if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
 		goto cleanup;
 
+	/* Test that we can directly clone into a new cgroup. */
+	cgroup_fd = dirfd_open_opath(cg_test_d);
+	if (cgroup_fd < 0)
+		goto cleanup;
+
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0) {
+		if (errno == ENOSYS)
+			goto cleanup_pass;
+		goto cleanup;
+	}
+
+	if (pid == 0) {
+		if (raise(SIGSTOP))
+			exit(EXIT_FAILURE);
+		exit(EXIT_SUCCESS);
+	}
+
+	err = cg_read_strcmp(cg_test_d, "cgroup.events", "populated 1\n");
+
+	(void)clone_reap(pid, WSTOPPED);
+	(void)kill(pid, SIGCONT);
+	(void)clone_reap(pid, WEXITED);
+
+	if (err)
+		goto cleanup;
+
+	if (cg_read_strcmp(cg_test_d, "cgroup.events", "populated 0\n"))
+		goto cleanup;
+
+	/* Remove cgroup. */
+	if (cg_test_d) {
+		cg_destroy(cg_test_d);
+		free(cg_test_d);
+		cg_test_d = NULL;
+	}
+
+	pid = clone_into_cgroup(cgroup_fd);
+	if (pid < 0)
+		goto cleanup_pass;
+	if (pid == 0)
+		exit(EXIT_SUCCESS);
+	(void)clone_reap(pid, WEXITED);
+	goto cleanup;
+
+cleanup_pass:
+	ret = KSFT_PASS;
+
 cleanup:

@@ -93,6 +254,8 @@ static int test_cgcore_populated(const char *root)
 	free(cg_test_c);
 	free(cg_test_b);
 	free(cg_test_a);
+	if (cgroup_fd >= 0)
+		close(cgroup_fd);
 	return ret;
 }
 

@@ -136,6 +299,16 @@ static int test_cgcore_invalid_domain(const char *root)
 	if (errno != EOPNOTSUPP)
 		goto cleanup;
 
+	if (!clone_into_cgroup_run_wait(child))
+		goto cleanup;
+
+	if (errno == ENOSYS)
+		goto cleanup_pass;
+
+	if (errno != EOPNOTSUPP)
+		goto cleanup;
+
+cleanup_pass:
 	ret = KSFT_PASS;
 
 cleanup:

@@ -345,6 +518,9 @@ static int test_cgcore_internal_process_constraint(const char *root)
 	if (!cg_enter_current(parent))
 		goto cleanup;
 
+	if (!clone_into_cgroup_run_wait(parent))
+		goto cleanup;
+
 	ret = KSFT_PASS;
 
 cleanup:

@@ -512,6 +688,7 @@ struct corecg_test {
 	T(test_cgcore_populated),
 	T(test_cgcore_proc_migration),
 	T(test_cgcore_thread_migration),
+	T(test_cgcore_destroy),
 };
 #undef T
tools/testing/selftests/clone3/clone3_selftests.h:

@@ -5,12 +5,24 @@
 
 #define _GNU_SOURCE
 #include <sched.h>
+#include <linux/sched.h>
+#include <linux/types.h>
 #include <stdint.h>
 #include <syscall.h>
-#include <linux/types.h>
+#include <sys/wait.h>
 
 #include "../kselftest.h"
 
+#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
+
+#ifndef CLONE_INTO_CGROUP
+#define CLONE_INTO_CGROUP 0x200000000ULL /* Clone into a specific cgroup given the right permissions. */
+#endif
+
+#ifndef CLONE_ARGS_SIZE_VER0
+#define CLONE_ARGS_SIZE_VER0 64
+#endif
+
 #ifndef __NR_clone3
 #define __NR_clone3 -1
 struct clone_args {

@@ -22,10 +34,13 @@ struct clone_args {
 	__aligned_u64 stack;
 	__aligned_u64 stack_size;
 	__aligned_u64 tls;
+#define CLONE_ARGS_SIZE_VER1 80
 	__aligned_u64 set_tid;
 	__aligned_u64 set_tid_size;
+#define CLONE_ARGS_SIZE_VER2 88
+	__aligned_u64 cgroup;
 };
-#endif
+#endif /* __NR_clone3 */
 
 static pid_t sys_clone3(struct clone_args *args, size_t size)
 {