diff --git a/arch/i386/kernel/syscall_table.S b/arch/i386/kernel/syscall_table.S
index f7ba4acc20ec..6ff3e5243226 100644
--- a/arch/i386/kernel/syscall_table.S
+++ b/arch/i386/kernel/syscall_table.S
@@ -293,3 +293,4 @@ ENTRY(sys_call_table)
 	.long sys_inotify_init
 	.long sys_inotify_add_watch
 	.long sys_inotify_rm_watch
+	.long sys_migrate_pages
diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S
index 0741b066b98f..7a6ffd613789 100644
--- a/arch/ia64/kernel/entry.S
+++ b/arch/ia64/kernel/entry.S
@@ -1600,5 +1600,6 @@ sys_call_table:
 	data8 sys_inotify_init
 	data8 sys_inotify_add_watch
 	data8 sys_inotify_rm_watch
+	data8 sys_migrate_pages			// 1280

 	.org sys_call_table + 8*NR_syscalls	// guard against failures to increase NR_syscalls
diff --git a/arch/x86_64/ia32/ia32entry.S b/arch/x86_64/ia32/ia32entry.S
index df0773c9bdbe..1f0ff5adc80e 100644
--- a/arch/x86_64/ia32/ia32entry.S
+++ b/arch/x86_64/ia32/ia32entry.S
@@ -643,6 +643,7 @@ ia32_sys_call_table:
 	.quad sys_inotify_init
 	.quad sys_inotify_add_watch
 	.quad sys_inotify_rm_watch
+	.quad sys_migrate_pages
 ia32_syscall_end:
 	.rept IA32_NR_syscalls-(ia32_syscall_end-ia32_sys_call_table)/8
 		.quad ni_syscall
diff --git a/include/asm-i386/unistd.h b/include/asm-i386/unistd.h
index fe38b9a96233..481c3c0ea720 100644
--- a/include/asm-i386/unistd.h
+++ b/include/asm-i386/unistd.h
@@ -299,8 +299,9 @@
 #define __NR_inotify_init	291
 #define __NR_inotify_add_watch	292
 #define __NR_inotify_rm_watch	293
+#define __NR_migrate_pages	294

-#define NR_syscalls 294
+#define NR_syscalls 295

 /*
  * user-visible error numbers are in the range -1 - -128: see
diff --git a/include/asm-ia64/unistd.h b/include/asm-ia64/unistd.h
index 2bf543493cb8..962f9bd1bdff 100644
--- a/include/asm-ia64/unistd.h
+++ b/include/asm-ia64/unistd.h
@@ -269,12 +269,13 @@
 #define __NR_inotify_init		1277
 #define __NR_inotify_add_watch		1278
 #define __NR_inotify_rm_watch		1279
+#define __NR_migrate_pages		1280

 #ifdef __KERNEL__

 #include <linux/config.h>

-#define NR_syscalls			256 /* length of syscall table */
+#define NR_syscalls			270 /* length of syscall table */

 #define __ARCH_WANT_SYS_RT_SIGACTION
diff --git a/include/asm-x86_64/ia32_unistd.h b/include/asm-x86_64/ia32_unistd.h
index d5166ec3868d..e8843362a6cc 100644
--- a/include/asm-x86_64/ia32_unistd.h
+++ b/include/asm-x86_64/ia32_unistd.h
@@ -299,7 +299,8 @@
 #define __NR_ia32_inotify_init		291
 #define __NR_ia32_inotify_add_watch	292
 #define __NR_ia32_inotify_rm_watch	293
+#define __NR_ia32_migrate_pages		294

-#define IA32_NR_syscalls 294	/* must be > than biggest syscall! */
+#define IA32_NR_syscalls 295	/* must be > than biggest syscall! */

 #endif /* _ASM_X86_64_IA32_UNISTD_H_ */
diff --git a/include/asm-x86_64/unistd.h b/include/asm-x86_64/unistd.h
index 2c42150bce0c..e6f896161c11 100644
--- a/include/asm-x86_64/unistd.h
+++ b/include/asm-x86_64/unistd.h
@@ -571,8 +571,10 @@ __SYSCALL(__NR_inotify_init, sys_inotify_init)
 __SYSCALL(__NR_inotify_add_watch, sys_inotify_add_watch)
 #define __NR_inotify_rm_watch 255
 __SYSCALL(__NR_inotify_rm_watch, sys_inotify_rm_watch)
+#define __NR_migrate_pages 256
+__SYSCALL(__NR_migrate_pages, sys_migrate_pages)

-#define __NR_syscall_max __NR_inotify_rm_watch
+#define __NR_syscall_max __NR_migrate_pages
 #ifndef __NO_STUBS

 /* user-visible error numbers are in the range -1 - -4095 */
diff --git a/include/linux/mempolicy.h b/include/linux/mempolicy.h
index 05443a766cb8..3e61e829681d 100644
--- a/include/linux/mempolicy.h
+++ b/include/linux/mempolicy.h
@@ -162,6 +162,9 @@ static inline void check_highest_zone(int k)
 		policy_zone = k;
 }

+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags);
+
 #else

 struct mempolicy {};
diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
index c7007b1db91d..e910d1a481df 100644
--- a/include/linux/syscalls.h
+++ b/include/linux/syscalls.h
@@ -511,5 +511,7 @@ asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
 asmlinkage long sys_ioprio_get(int which, int who);
 asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 				unsigned long maxnode);
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+				const unsigned long __user *from, const unsigned long __user *to);

 #endif
diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c
index 1ab2370e2efa..7a8bc7f60d91 100644
--- a/kernel/sys_ni.c
+++ b/kernel/sys_ni.c
@@ -82,6 +82,7 @@ cond_syscall(compat_sys_socketcall);
 cond_syscall(sys_inotify_init);
 cond_syscall(sys_inotify_add_watch);
 cond_syscall(sys_inotify_rm_watch);
+cond_syscall(sys_migrate_pages);

 /* arch-specific weak syscall entries */
 cond_syscall(sys_pciconfig_read);
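
[Note: the prototype added to include/linux/syscalls.h passes each node set the same way the neighbouring sys_set_mempolicy() already does: a raw bitmap of unsigned long words plus a maxnode bit count, which get_nodes() below copies in and turns into a nodemask_t. As a rough illustration of that encoding, not part of the patch and with a made-up helper name, a caller would set node bits like this:

	/* Hypothetical user-side helper: set bit 'node' in a bitmap stored as
	 * an array of unsigned long words, the layout get_nodes() expects. */
	#include <limits.h>	/* CHAR_BIT */

	#define BITS_PER_UL	(sizeof(unsigned long) * CHAR_BIT)

	static inline void nodemask_set_bit(unsigned long *mask, unsigned long node)
	{
		mask[node / BITS_PER_UL] |= 1UL << (node % BITS_PER_UL);
	}
]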
diff --git a/mm/mempolicy.c b/mm/mempolicy.c
index 9cc6d962831d..20d5ad39fa41 100644
--- a/mm/mempolicy.c
+++ b/mm/mempolicy.c
@@ -614,12 +614,42 @@ long do_get_mempolicy(int *policy, nodemask_t *nmask,
 	return err;
 }

+/*
+ * For now migrate_pages simply swaps out the pages from nodes that are in
+ * the source set but not in the target set. In the future, we would
+ * want a function that moves pages between the two nodesets in such
+ * a way as to preserve the physical layout as much as possible.
+ *
+ * Returns the number of pages that could not be moved.
+ */
+int do_migrate_pages(struct mm_struct *mm,
+	const nodemask_t *from_nodes, const nodemask_t *to_nodes, int flags)
+{
+	LIST_HEAD(pagelist);
+	int count = 0;
+	nodemask_t nodes;
+
+	nodes_andnot(nodes, *from_nodes, *to_nodes);
+	nodes_complement(nodes, nodes);
+
+	down_read(&mm->mmap_sem);
+	check_range(mm, mm->mmap->vm_start, TASK_SIZE, &nodes,
+			flags | MPOL_MF_DISCONTIG_OK, &pagelist);
+	if (!list_empty(&pagelist)) {
+		migrate_pages(&pagelist, NULL);
+		if (!list_empty(&pagelist))
+			count = putback_lru_pages(&pagelist);
+	}
+	up_read(&mm->mmap_sem);
+	return count;
+}
+
 /*
  * User space interface with variable sized bitmaps for nodelists.
  */

 /* Copy a node mask from user space. */
-static int get_nodes(nodemask_t *nodes, unsigned long __user *nmask,
+static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask,
 		     unsigned long maxnode)
 {
 	unsigned long k;
@@ -708,6 +738,68 @@ asmlinkage long sys_set_mempolicy(int mode, unsigned long __user *nmask,
 	return do_set_mempolicy(mode, &nodes);
 }

+/* Macro needed until Paul implements this function in kernel/cpusets.c */
+#define cpuset_mems_allowed(task) node_online_map
+
+asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
+		const unsigned long __user *old_nodes,
+		const unsigned long __user *new_nodes)
+{
+	struct mm_struct *mm;
+	struct task_struct *task;
+	nodemask_t old;
+	nodemask_t new;
+	nodemask_t task_nodes;
+	int err;
+
+	err = get_nodes(&old, old_nodes, maxnode);
+	if (err)
+		return err;
+
+	err = get_nodes(&new, new_nodes, maxnode);
+	if (err)
+		return err;
+
+	/* Find the mm_struct */
+	read_lock(&tasklist_lock);
+	task = pid ? find_task_by_pid(pid) : current;
+	if (!task) {
+		read_unlock(&tasklist_lock);
+		return -ESRCH;
+	}
+	mm = get_task_mm(task);
+	read_unlock(&tasklist_lock);
+
+	if (!mm)
+		return -EINVAL;
+
+	/*
+	 * Check if this process has the right to modify the specified
+	 * process. The right exists if the process has administrative
+	 * capabilities, superuser privileges or the same
+	 * userid as the target process.
+	 */
+	if ((current->euid != task->suid) && (current->euid != task->uid) &&
+	    (current->uid != task->suid) && (current->uid != task->uid) &&
+	    !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	task_nodes = cpuset_mems_allowed(task);
+	/* Is the user allowed to access the target nodes? */
+	if (!nodes_subset(new, task_nodes) && !capable(CAP_SYS_ADMIN)) {
+		err = -EPERM;
+		goto out;
+	}
+
+	err = do_migrate_pages(mm, &old, &new, MPOL_MF_MOVE);
+out:
+	mmput(mm);
+	return err;
+}
+
+
 /* Retrieve NUMA policy */
 asmlinkage long sys_get_mempolicy(int __user *policy,
 		unsigned long __user *nmask,
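
[Note: for completeness, a minimal user-space sketch of how the new call could be exercised. This is an assumption-laden example, not part of the patch: there is no glibc wrapper yet, so it goes through syscall(2) directly, and the fallback __NR_migrate_pages values are the numbers assigned by the headers above (294 on i386, 256 on x86_64, 1280 on ia64). Per do_migrate_pages(), a non-negative return is the number of pages that could not be moved.

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/syscall.h>

	#ifndef __NR_migrate_pages
	# if defined(__i386__)
	#  define __NR_migrate_pages	294	/* asm-i386/unistd.h above */
	# elif defined(__x86_64__)
	#  define __NR_migrate_pages	256	/* asm-x86_64/unistd.h above */
	# elif defined(__ia64__)
	#  define __NR_migrate_pages	1280	/* asm-ia64/unistd.h above */
	# endif
	#endif

	int main(void)
	{
		/* One word of node bits is enough for this sketch. */
		unsigned long maxnode = sizeof(unsigned long) * 8;
		unsigned long old_nodes = 1UL << 0;	/* move pages away from node 0 ... */
		unsigned long new_nodes = 1UL << 1;	/* ... and onto node 1 */
		long left;

		/* pid 0 targets the calling process (see sys_migrate_pages() above). */
		left = syscall(__NR_migrate_pages, 0, maxnode, &old_nodes, &new_nodes);
		if (left < 0)
			perror("migrate_pages");
		else
			printf("%ld pages could not be moved\n", left);
		return 0;
	}
]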