diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 5207c28c2402..50d8cc37498b 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -275,6 +275,9 @@ enum { * - "tasks" is removed. Everything should be at process * granularity. Use "cgroup.procs" instead. * + * - "cgroup.procs" is not sorted. pids will be unique unless they + * got recycled inbetween reads. + * * - "release_agent" and "notify_on_release" are removed. * Replacement notification mechanism will be implemented. * diff --git a/kernel/cgroup.c b/kernel/cgroup.c index a2458031d851..f9f5fe3526ac 100644 --- a/kernel/cgroup.c +++ b/kernel/cgroup.c @@ -3564,11 +3564,49 @@ static int pidlist_uniq(pid_t *list, int length) return dest; } +/* + * The two pid files - task and cgroup.procs - guaranteed that the result + * is sorted, which forced this whole pidlist fiasco. As pid order is + * different per namespace, each namespace needs differently sorted list, + * making it impossible to use, for example, single rbtree of member tasks + * sorted by task pointer. As pidlists can be fairly large, allocating one + * per open file is dangerous, so cgroup had to implement shared pool of + * pidlists keyed by cgroup and namespace. + * + * All this extra complexity was caused by the original implementation + * committing to an entirely unnecessary property. In the long term, we + * want to do away with it. Explicitly scramble sort order if + * sane_behavior so that no such expectation exists in the new interface. + * + * Scrambling is done by swapping every two consecutive bits, which is + * non-identity one-to-one mapping which disturbs sort order sufficiently. + */ +static pid_t pid_fry(pid_t pid) +{ + unsigned a = pid & 0x55555555; + unsigned b = pid & 0xAAAAAAAA; + + return (a << 1) | (b >> 1); +} + +static pid_t cgroup_pid_fry(struct cgroup *cgrp, pid_t pid) +{ + if (cgroup_sane_behavior(cgrp)) + return pid_fry(pid); + else + return pid; +} + static int cmppid(const void *a, const void *b) { return *(pid_t *)a - *(pid_t *)b; } +static int fried_cmppid(const void *a, const void *b) +{ + return pid_fry(*(pid_t *)a) - pid_fry(*(pid_t *)b); +} + static struct cgroup_pidlist *cgroup_pidlist_find(struct cgroup *cgrp, enum cgroup_filetype type) { @@ -3656,7 +3694,10 @@ static int pidlist_array_load(struct cgroup *cgrp, enum cgroup_filetype type, css_task_iter_end(&it); length = n; /* now sort & (if procs) strip out duplicates */ - sort(array, length, sizeof(pid_t), cmppid, NULL); + if (cgroup_sane_behavior(cgrp)) + sort(array, length, sizeof(pid_t), fried_cmppid, NULL); + else + sort(array, length, sizeof(pid_t), cmppid, NULL); if (type == CGROUP_FILE_PROCS) length = pidlist_uniq(array, length); @@ -3777,10 +3818,10 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) while (index < end) { int mid = (index + end) / 2; - if (l->list[mid] == pid) { + if (cgroup_pid_fry(cgrp, l->list[mid]) == pid) { index = mid; break; - } else if (l->list[mid] <= pid) + } else if (cgroup_pid_fry(cgrp, l->list[mid]) <= pid) index = mid + 1; else end = mid; @@ -3791,7 +3832,7 @@ static void *cgroup_pidlist_start(struct seq_file *s, loff_t *pos) return NULL; /* Update the abstract position to be the actual pid that we found */ iter = l->list + index; - *pos = *iter; + *pos = cgroup_pid_fry(cgrp, *iter); return iter; } @@ -3820,7 +3861,7 @@ static void *cgroup_pidlist_next(struct seq_file *s, void *v, loff_t *pos) if (p >= end) { return NULL; } else { - *pos = *p; + *pos = cgroup_pid_fry(of->cgrp, *p); return p; } }