diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl index ec8bed9e7b75..ee7b01bb7346 100644 --- a/arch/alpha/kernel/syscalls/syscall.tbl +++ b/arch/alpha/kernel/syscalls/syscall.tbl @@ -479,3 +479,4 @@ 547 common openat2 sys_openat2 548 common pidfd_getfd sys_pidfd_getfd 549 common faccessat2 sys_faccessat2 +550 common process_madvise sys_process_madvise diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl index 171077cbf419..d056a548358e 100644 --- a/arch/arm/tools/syscall.tbl +++ b/arch/arm/tools/syscall.tbl @@ -453,3 +453,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/arm64/include/asm/unistd.h b/arch/arm64/include/asm/unistd.h index 3b859596840d..b3b2019f8d16 100644 --- a/arch/arm64/include/asm/unistd.h +++ b/arch/arm64/include/asm/unistd.h @@ -38,7 +38,7 @@ #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) -#define __NR_compat_syscalls 440 +#define __NR_compat_syscalls 441 #endif #define __ARCH_WANT_SYS_CLONE diff --git a/arch/arm64/include/asm/unistd32.h b/arch/arm64/include/asm/unistd32.h index 2a3ad9b9accd..107f08e03b9f 100644 --- a/arch/arm64/include/asm/unistd32.h +++ b/arch/arm64/include/asm/unistd32.h @@ -887,6 +887,8 @@ __SYSCALL(__NR_openat2, sys_openat2) __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #define __NR_faccessat2 439 __SYSCALL(__NR_faccessat2, sys_faccessat2) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) /* * Please add new compat syscalls above this comment and update diff --git a/arch/ia64/kernel/syscalls/syscall.tbl b/arch/ia64/kernel/syscalls/syscall.tbl index 4799c96c325f..b96ed8b8a508 100644 --- a/arch/ia64/kernel/syscalls/syscall.tbl +++ b/arch/ia64/kernel/syscalls/syscall.tbl @@ -360,3 +360,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl index 81fc799d8392..625fb6d32842 100644 --- a/arch/m68k/kernel/syscalls/syscall.tbl +++ b/arch/m68k/kernel/syscalls/syscall.tbl @@ -439,3 +439,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl index b4e263916f41..aae729c95cf9 100644 --- a/arch/microblaze/kernel/syscalls/syscall.tbl +++ b/arch/microblaze/kernel/syscalls/syscall.tbl @@ -445,3 +445,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl index cf72a0206a87..32817c954435 100644 --- a/arch/mips/kernel/syscalls/syscall_n32.tbl +++ b/arch/mips/kernel/syscalls/syscall_n32.tbl @@ -378,3 +378,4 @@ 437 n32 openat2 sys_openat2 438 n32 pidfd_getfd sys_pidfd_getfd 439 n32 faccessat2 sys_faccessat2 +440 n32 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl index 557f9954a2b9..9e4ea3c31b1c 100644 --- a/arch/mips/kernel/syscalls/syscall_n64.tbl +++ b/arch/mips/kernel/syscalls/syscall_n64.tbl @@ -354,3 +354,4 @@ 437 n64 openat2 sys_openat2 438 n64 pidfd_getfd sys_pidfd_getfd 439 n64 faccessat2 sys_faccessat2 +440 n64 process_madvise sys_process_madvise diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl index a17aab5abeb2..29f5f28cf5ce 100644 --- a/arch/mips/kernel/syscalls/syscall_o32.tbl +++ b/arch/mips/kernel/syscalls/syscall_o32.tbl @@ -427,3 +427,4 @@ 437 o32 openat2 sys_openat2 438 o32 pidfd_getfd sys_pidfd_getfd 439 o32 faccessat2 sys_faccessat2 +440 o32 process_madvise sys_process_madvise diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl index ae3dab371f6f..38c63e5404bc 100644 --- a/arch/parisc/kernel/syscalls/syscall.tbl +++ b/arch/parisc/kernel/syscalls/syscall.tbl @@ -437,3 +437,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl index 9d7fb4ced290..1275daec7fec 100644 --- a/arch/powerpc/kernel/syscalls/syscall.tbl +++ b/arch/powerpc/kernel/syscalls/syscall.tbl @@ -529,3 +529,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl index 1c3b48165e86..28c168000483 100644 --- a/arch/s390/kernel/syscalls/syscall.tbl +++ b/arch/s390/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 437 common openat2 sys_openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise sys_process_madvise diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl index ae0a00beea5f..783738448ff5 100644 --- a/arch/sh/kernel/syscalls/syscall.tbl +++ b/arch/sh/kernel/syscalls/syscall.tbl @@ -442,3 +442,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl index 37ec52b34c73..78160260991b 100644 --- a/arch/sparc/kernel/syscalls/syscall.tbl +++ b/arch/sparc/kernel/syscalls/syscall.tbl @@ -485,3 +485,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl index 9b6931f8d555..0d0667a9fbd7 100644 --- a/arch/x86/entry/syscalls/syscall_32.tbl +++ b/arch/x86/entry/syscalls/syscall_32.tbl @@ -444,3 +444,4 @@ 437 i386 openat2 sys_openat2 438 i386 pidfd_getfd sys_pidfd_getfd 439 i386 faccessat2 sys_faccessat2 +440 i386 process_madvise sys_process_madvise diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl index 347809649ba2..1f47e24fb65c 100644 --- a/arch/x86/entry/syscalls/syscall_64.tbl +++ b/arch/x86/entry/syscalls/syscall_64.tbl @@ -361,6 +361,7 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise # # x32-specific system call numbers start at 512 to avoid cache impact diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl index 6276e3c2d3fc..b070f272995d 100644 --- a/arch/xtensa/kernel/syscalls/syscall.tbl +++ b/arch/xtensa/kernel/syscalls/syscall.tbl @@ -410,3 +410,4 @@ 437 common openat2 sys_openat2 438 common pidfd_getfd sys_pidfd_getfd 439 common faccessat2 sys_faccessat2 +440 common process_madvise sys_process_madvise diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h index 06db09875aa4..2eda7678fe1d 100644 --- a/include/linux/syscalls.h +++ b/include/linux/syscalls.h @@ -879,6 +879,8 @@ asmlinkage long sys_munlockall(void); asmlinkage long sys_mincore(unsigned long start, size_t len, unsigned char __user * vec); asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); +asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, unsigned long prot, unsigned long pgoff, unsigned long flags); diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h index f2b5d72a46c2..2056318988f7 100644 --- a/include/uapi/asm-generic/unistd.h +++ b/include/uapi/asm-generic/unistd.h @@ -857,9 +857,11 @@ __SYSCALL(__NR_openat2, sys_openat2) __SYSCALL(__NR_pidfd_getfd, sys_pidfd_getfd) #define __NR_faccessat2 439 __SYSCALL(__NR_faccessat2, sys_faccessat2) +#define __NR_process_madvise 440 +__SYSCALL(__NR_process_madvise, sys_process_madvise) #undef __NR_syscalls -#define __NR_syscalls 440 +#define __NR_syscalls 441 /* * 32 bit systems traditionally used different diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index c925d1e1777e..f27ac94d5fa7 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -280,6 +280,7 @@ COND_SYSCALL(mlockall); COND_SYSCALL(munlockall); COND_SYSCALL(mincore); COND_SYSCALL(madvise); +COND_SYSCALL(process_madvise); COND_SYSCALL(remap_file_pages); COND_SYSCALL(mbind); COND_SYSCALL_COMPAT(mbind); diff --git a/mm/madvise.c b/mm/madvise.c index d550ef045288..416a56b8e757 100644 --- a/mm/madvise.c +++ b/mm/madvise.c @@ -17,6 +17,8 @@ #include #include #include +#include +#include #include #include #include @@ -27,7 +29,6 @@ #include #include #include -#include #include @@ -988,6 +989,18 @@ madvise_behavior_valid(int behavior) } } +static bool +process_madvise_behavior_valid(int behavior) +{ + switch (behavior) { + case MADV_COLD: + case MADV_PAGEOUT: + return true; + default: + return false; + } +} + /* * The madvise(2) system call. * @@ -1035,6 +1048,11 @@ madvise_behavior_valid(int behavior) * MADV_DONTDUMP - the application wants to prevent pages in the given range * from being included in its core dump. * MADV_DODUMP - cancel MADV_DONTDUMP: no longer exclude from core dump. + * MADV_COLD - the application is not expected to use this memory soon, + * deactivate pages in this range so that they can be reclaimed + * easily if memory pressure hanppens. + * MADV_PAGEOUT - the application is not expected to use this memory soon, + * page out the pages in this range immediately. * * return values: * zero - success @@ -1151,3 +1169,76 @@ SYSCALL_DEFINE3(madvise, unsigned long, start, size_t, len_in, int, behavior) { return do_madvise(current->mm, start, len_in, behavior); } + +SYSCALL_DEFINE5(process_madvise, int, pidfd, const struct iovec __user *, vec, + size_t, vlen, int, behavior, unsigned int, flags) +{ + ssize_t ret; + struct iovec iovstack[UIO_FASTIOV], iovec; + struct iovec *iov = iovstack; + struct iov_iter iter; + struct pid *pid; + struct task_struct *task; + struct mm_struct *mm; + size_t total_len; + unsigned int f_flags; + + if (flags != 0) { + ret = -EINVAL; + goto out; + } + + ret = import_iovec(READ, vec, vlen, ARRAY_SIZE(iovstack), &iov, &iter); + if (ret < 0) + goto out; + + pid = pidfd_get_pid(pidfd, &f_flags); + if (IS_ERR(pid)) { + ret = PTR_ERR(pid); + goto free_iov; + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + ret = -ESRCH; + goto put_pid; + } + + if (task->mm != current->mm && + !process_madvise_behavior_valid(behavior)) { + ret = -EINVAL; + goto release_task; + } + + mm = mm_access(task, PTRACE_MODE_ATTACH_FSCREDS); + if (IS_ERR_OR_NULL(mm)) { + ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; + goto release_task; + } + + total_len = iov_iter_count(&iter); + + while (iov_iter_count(&iter)) { + iovec = iov_iter_iovec(&iter); + ret = do_madvise(mm, (unsigned long)iovec.iov_base, + iovec.iov_len, behavior); + if (ret < 0) + break; + iov_iter_advance(&iter, iovec.iov_len); + } + + if (ret == 0) + ret = total_len - iov_iter_count(&iter); + + mmput(mm); + return ret; + +release_task: + put_task_struct(task); +put_pid: + put_pid(pid); +free_iov: + kfree(iov); +out: + return ret; +}