From 0a313a998adbae19c1309f80a3ad79107fff7c4e Mon Sep 17 00:00:00 2001 From: Xishi Qiu Date: Tue, 9 Sep 2014 14:50:46 -0700 Subject: [PATCH 01/10] mem-hotplug: let memblock skip the hotpluggable memory regions in __next_mem_range() Let memblock skip the hotpluggable memory regions in __next_mem_range(), it is used to to prevent memblock from allocating hotpluggable memory for the kernel at early time. The code is the same as __next_mem_range_rev(). Clear hotpluggable flag before releasing free pages to the buddy allocator. If we don't clear hotpluggable flag in free_low_memory_core_early(), the memory which marked hotpluggable flag will not free to buddy allocator. Because __next_mem_range() will skip them. free_low_memory_core_early for_each_free_mem_range for_each_mem_range __next_mem_range [akpm@linux-foundation.org: fix warning] Signed-off-by: Xishi Qiu Cc: Tejun Heo Cc: Tang Chen Cc: Zhang Yanfei Cc: Wen Congyang Cc: "Rafael J. Wysocki" Cc: "H. Peter Anvin" Cc: Wu Fengguang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 4 ++++ mm/nobootmem.c | 2 ++ 2 files changed, 6 insertions(+) diff --git a/mm/memblock.c b/mm/memblock.c index 70fad0c0dafb..6ecb0d937fb5 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -816,6 +816,10 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, if (nid != NUMA_NO_NODE && nid != m_nid) continue; + /* skip hotpluggable memory regions if needed */ + if (movable_node_is_enabled() && memblock_is_hotpluggable(m)) + continue; + if (!type_b) { if (out_start) *out_start = m_start; diff --git a/mm/nobootmem.c b/mm/nobootmem.c index 7ed58602e71b..7c7ab32ee503 100644 --- a/mm/nobootmem.c +++ b/mm/nobootmem.c @@ -119,6 +119,8 @@ static unsigned long __init free_low_memory_core_early(void) phys_addr_t start, end; u64 i; + memblock_clear_hotplug(0, -1); + for_each_free_mem_range(i, NUMA_NO_NODE, &start, &end, NULL) count += __free_memory_core(start, end); From 000a7d66ec30898f46869be01ab8205b056385d0 Mon Sep 17 00:00:00 2001 From: Patrick Palka Date: Tue, 9 Sep 2014 14:50:48 -0700 Subject: [PATCH 02/10] kernel/printk/printk.c: fix faulty logic in the case of recursive printk We shouldn't set text_len in the code path that detects printk recursion because text_len corresponds to the length of the string inside textbuf. A few lines down from the line text_len = strlen(recursion_msg); is the line text_len += vscnprintf(text + text_len, ...); So if printk detects recursion, it sets text_len to 29 (the length of recursion_msg) and logs an error. Then the message supplied by the caller of printk is stored inside textbuf but offset by 29 bytes. This means that the output of the recursive call to printk will contain 29 bytes of garbage in front of it. This defect is caused by commit 458df9fd4815 ("printk: remove separate printk_sched buffers and use printk buf instead") which turned the line text_len = vscnprintf(text, ...); into text_len += vscnprintf(text + text_len, ...); To fix this, this patch avoids setting text_len when logging the printk recursion error. This patch also marks unlikely() the branch leading up to this code. Fixes: 458df9fd4815b478 ("printk: remove separate printk_sched buffers and use printk buf instead") Signed-off-by: Patrick Palka Reviewed-by: Petr Mladek Reviewed-by: Jan Kara Acked-by: Steven Rostedt Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/printk/printk.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index e04c455a0e38..1ce770687ea8 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -1665,15 +1665,15 @@ asmlinkage int vprintk_emit(int facility, int level, raw_spin_lock(&logbuf_lock); logbuf_cpu = this_cpu; - if (recursion_bug) { + if (unlikely(recursion_bug)) { static const char recursion_msg[] = "BUG: recent printk recursion!"; recursion_bug = 0; - text_len = strlen(recursion_msg); /* emit KERN_CRIT message */ printed_len += log_store(0, 2, LOG_PREFIX|LOG_NEWLINE, 0, - NULL, 0, recursion_msg, text_len); + NULL, 0, recursion_msg, + strlen(recursion_msg)); } /* From c680e41b3a2e944185c74bf60531e3d316d3ecc4 Mon Sep 17 00:00:00 2001 From: Nicolas Iooss Date: Tue, 9 Sep 2014 14:50:51 -0700 Subject: [PATCH 03/10] eventpoll: fix uninitialized variable in epoll_ctl MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When calling epoll_ctl with operation EPOLL_CTL_DEL, structure epds is not initialized but ep_take_care_of_epollwakeup reads its event field. When this unintialized field has EPOLLWAKEUP bit set, a capability check is done for CAP_BLOCK_SUSPEND in ep_take_care_of_epollwakeup. This produces unexpected messages in the audit log, such as (on a system running SELinux): type=AVC msg=audit(1408212798.866:410): avc: denied { block_suspend } for pid=7754 comm="dbus-daemon" capability=36 scontext=unconfined_u:unconfined_r:unconfined_t tcontext=unconfined_u:unconfined_r:unconfined_t tclass=capability2 permissive=1 type=SYSCALL msg=audit(1408212798.866:410): arch=c000003e syscall=233 success=yes exit=0 a0=3 a1=2 a2=9 a3=7fffd4d66ec0 items=0 ppid=1 pid=7754 auid=1000 uid=0 gid=0 euid=0 suid=0 fsuid=0 egid=0 sgid=0 fsgid=0 tty=(none) ses=3 comm="dbus-daemon" exe="/usr/bin/dbus-daemon" subj=unconfined_u:unconfined_r:unconfined_t key=(null) ("arch=c000003e syscall=233 a1=2" means "epoll_ctl(op=EPOLL_CTL_DEL)") Remove use of epds in epoll_ctl when op == EPOLL_CTL_DEL. Fixes: 4d7e30d98939 ("epoll: Add a flag, EPOLLWAKEUP, to prevent suspend while epoll events are ready") Signed-off-by: Nicolas Iooss Cc: Alexander Viro Cc: Arve Hjønnevåg Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/eventpoll.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/eventpoll.c b/fs/eventpoll.c index b10b48c2a7af..7bcfff900f05 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -1852,7 +1852,8 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, goto error_tgt_fput; /* Check if EPOLLWAKEUP is allowed */ - ep_take_care_of_epollwakeup(&epds); + if (ep_op_has_event(op)) + ep_take_care_of_epollwakeup(&epds); /* * We have to check that the file structure underneath the file descriptor From caac7e6d00d3ddc888bd8169e75a02f962efdcff Mon Sep 17 00:00:00 2001 From: Stas Sergeev Date: Tue, 9 Sep 2014 14:50:53 -0700 Subject: [PATCH 04/10] sh: get_user_pages_fast() must flush cache This patch avoids fuse hangs on sh4 by flushing the cache on get_user_pages_fast(). This is not necessary a good thing to do, but get_user_pages() does this, so get_user_pages_fast() should too. Please note the patch for mips arch that addresses the similar problem: https://kernel.googlesource.com/pub/scm/linux/kernel/git/ralf/linux/+/linux-3.4.50%5E!/#F0 They basically simply disable get_user_pages_fast() at all, using a fall-back to get_user_pages(). But my fix is different, it adds an explicit cache flushes. Signed-off-by: Stas Sergeev Cc: Geert Uytterhoeven Cc: Kamal Dasu Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sh/mm/gup.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/sh/mm/gup.c b/arch/sh/mm/gup.c index bf8daf9d9c9b..37458f38b220 100644 --- a/arch/sh/mm/gup.c +++ b/arch/sh/mm/gup.c @@ -105,6 +105,8 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr, VM_BUG_ON(!pfn_valid(pte_pfn(pte))); page = pte_page(pte); get_page(page); + __flush_anon_page(page, addr); + flush_dcache_page(page); pages[*nr] = page; (*nr)++; From 66881735071e54283df38ffbd9584c63e3661b9f Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Tue, 9 Sep 2014 14:50:55 -0700 Subject: [PATCH 05/10] checkpatch: allow commit descriptions on separate line from commit id The general form for commit id and description is 'Commit <12+hexdigits> ("commit description/subject line")' but commit logs often have relatively long commit ids and the commit description emds on the next line like: Some explanation as to why commit <12+hexdigits> ("commit foo description/subject line") is improved. Allow this form. Signed-off-by: Joe Perches Suggested-by: Joe Lawrence Tested-by: Joe Lawrence Suggested-by: Geert Uytterhoeven Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index b385bcbbf2f5..4d08b398411f 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2133,7 +2133,10 @@ sub process { # Check for improperly formed commit descriptions if ($in_commit_log && $line =~ /\bcommit\s+[0-9a-f]{5,}/i && - $line !~ /\b[Cc]ommit [0-9a-f]{12,40} \("/) { + !($line =~ /\b[Cc]ommit [0-9a-f]{12,40} \("/ || + ($line =~ /\b[Cc]ommit [0-9a-f]{12,40}\s*$/ && + defined $rawlines[$linenr] && + $rawlines[$linenr] =~ /^\s*\("/))) { $line =~ /\b(c)ommit\s+([0-9a-f]{5,})/i; my $init_char = $1; my $orig_commit = lc($2); From b01d072065b6f36550f486fe77f05b092225ba1b Mon Sep 17 00:00:00 2001 From: David Drysdale Date: Tue, 9 Sep 2014 14:50:57 -0700 Subject: [PATCH 06/10] shm: add memfd.h to UAPI export list The new header file memfd.h from commit 9183df25fe7b ("shm: add memfd_create() syscall") should be exported. Signed-off-by: David Drysdale Reviewed-by: David Herrmann Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/uapi/linux/Kbuild | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index 24e9033f8b3f..939df6973102 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -240,6 +240,7 @@ header-y += matroxfb.h header-y += mdio.h header-y += media.h header-y += mei.h +header-y += memfd.h header-y += mempolicy.h header-y += meye.h header-y += mic_common.h From 8542bdfc6632b55aa1cf4fa255283c878b662499 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Tue, 9 Sep 2014 14:50:59 -0700 Subject: [PATCH 07/10] mm/mmap.c: use pr_emerg when printing BUG related information Make sure we actually see the output of validate_mm() and browse_rb() before triggering a BUG(). pr_info isn't shown by default so the reason for the BUG() isn't obvious. Signed-off-by: Sasha Levin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mmap.c | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/mm/mmap.c b/mm/mmap.c index c1f2ea4a0b99..c0a3637cdb64 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -369,20 +369,20 @@ static int browse_rb(struct rb_root *root) struct vm_area_struct *vma; vma = rb_entry(nd, struct vm_area_struct, vm_rb); if (vma->vm_start < prev) { - pr_info("vm_start %lx prev %lx\n", vma->vm_start, prev); + pr_emerg("vm_start %lx prev %lx\n", vma->vm_start, prev); bug = 1; } if (vma->vm_start < pend) { - pr_info("vm_start %lx pend %lx\n", vma->vm_start, pend); + pr_emerg("vm_start %lx pend %lx\n", vma->vm_start, pend); bug = 1; } if (vma->vm_start > vma->vm_end) { - pr_info("vm_end %lx < vm_start %lx\n", + pr_emerg("vm_end %lx < vm_start %lx\n", vma->vm_end, vma->vm_start); bug = 1; } if (vma->rb_subtree_gap != vma_compute_subtree_gap(vma)) { - pr_info("free gap %lx, correct %lx\n", + pr_emerg("free gap %lx, correct %lx\n", vma->rb_subtree_gap, vma_compute_subtree_gap(vma)); bug = 1; @@ -396,7 +396,7 @@ static int browse_rb(struct rb_root *root) for (nd = pn; nd; nd = rb_prev(nd)) j++; if (i != j) { - pr_info("backwards %d, forwards %d\n", j, i); + pr_emerg("backwards %d, forwards %d\n", j, i); bug = 1; } return bug ? -1 : i; @@ -431,17 +431,17 @@ static void validate_mm(struct mm_struct *mm) i++; } if (i != mm->map_count) { - pr_info("map_count %d vm_next %d\n", mm->map_count, i); + pr_emerg("map_count %d vm_next %d\n", mm->map_count, i); bug = 1; } if (highest_address != mm->highest_vm_end) { - pr_info("mm->highest_vm_end %lx, found %lx\n", + pr_emerg("mm->highest_vm_end %lx, found %lx\n", mm->highest_vm_end, highest_address); bug = 1; } i = browse_rb(&mm->mm_rb); if (i != mm->map_count) { - pr_info("map_count %d rb %d\n", mm->map_count, i); + pr_emerg("map_count %d rb %d\n", mm->map_count, i); bug = 1; } BUG_ON(bug); From acbbe6fbb240a927ee1f5994f04d31267d422215 Mon Sep 17 00:00:00 2001 From: Rasmus Villemoes Date: Tue, 9 Sep 2014 14:51:01 -0700 Subject: [PATCH 08/10] kcmp: fix standard comparison bug The C operator <= defines a perfectly fine total ordering on the set of values representable in a long. However, unlike its namesake in the integers, it is not translation invariant, meaning that we do not have "b <= c" iff "a+b <= a+c" for all a,b,c. This means that it is always wrong to try to boil down the relationship between two longs to a question about the sign of their difference, because the resulting relation [a LEQ b iff a-b <= 0] is neither anti-symmetric or transitive. The former is due to -LONG_MIN==LONG_MIN (take any two a,b with a-b = LONG_MIN; then a LEQ b and b LEQ a, but a != b). The latter can either be seen observing that x LEQ x+1 for all x, implying x LEQ x+1 LEQ x+2 ... LEQ x-1 LEQ x; or more directly with the simple example a=LONG_MIN, b=0, c=1, for which a-b < 0, b-c < 0, but a-c > 0. Note that it makes absolutely no difference that a transmogrying bijection has been applied before the comparison is done. In fact, had the obfuscation not been done, one could probably not observe the bug (assuming all values being compared always lie in one half of the address space, the mathematical value of a-b is always representable in a long). As it stands, one can easily obtain three file descriptors exhibiting the non-transitivity of kcmp(). Side note 1: I can't see that ensuring the MSB of the multiplier is set serves any purpose other than obfuscating the obfuscating code. Side note 2: #include #include #include #include #include #include #include enum kcmp_type { KCMP_FILE, KCMP_VM, KCMP_FILES, KCMP_FS, KCMP_SIGHAND, KCMP_IO, KCMP_SYSVSEM, KCMP_TYPES, }; pid_t pid; int kcmp(pid_t pid1, pid_t pid2, int type, unsigned long idx1, unsigned long idx2) { return syscall(SYS_kcmp, pid1, pid2, type, idx1, idx2); } int cmp_fd(int fd1, int fd2) { int c = kcmp(pid, pid, KCMP_FILE, fd1, fd2); if (c < 0) { perror("kcmp"); exit(1); } assert(0 <= c && c < 3); return c; } int cmp_fdp(const void *a, const void *b) { static const int normalize[] = {0, -1, 1}; return normalize[cmp_fd(*(int*)a, *(int*)b)]; } #define MAX 100 /* This is plenty; I've seen it trigger for MAX==3 */ int main(int argc, char *argv[]) { int r, s, count = 0; int REL[3] = {0,0,0}; int fd[MAX]; pid = getpid(); while (count < MAX) { r = open("/dev/null", O_RDONLY); if (r < 0) break; fd[count++] = r; } printf("opened %d file descriptors\n", count); for (r = 0; r < count; ++r) { for (s = r+1; s < count; ++s) { REL[cmp_fd(fd[r], fd[s])]++; } } printf("== %d\t< %d\t> %d\n", REL[0], REL[1], REL[2]); qsort(fd, count, sizeof(fd[0]), cmp_fdp); memset(REL, 0, sizeof(REL)); for (r = 0; r < count; ++r) { for (s = r+1; s < count; ++s) { REL[cmp_fd(fd[r], fd[s])]++; } } printf("== %d\t< %d\t> %d\n", REL[0], REL[1], REL[2]); return (REL[0] + REL[2] != 0); } Signed-off-by: Rasmus Villemoes Reviewed-by: Cyrill Gorcunov "Eric W. Biederman" Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/kcmp.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/kernel/kcmp.c b/kernel/kcmp.c index e30ac0fe61c3..0aa69ea1d8fd 100644 --- a/kernel/kcmp.c +++ b/kernel/kcmp.c @@ -44,11 +44,12 @@ static long kptr_obfuscate(long v, int type) */ static int kcmp_ptr(void *v1, void *v2, enum kcmp_type type) { - long ret; + long t1, t2; - ret = kptr_obfuscate((long)v1, type) - kptr_obfuscate((long)v2, type); + t1 = kptr_obfuscate((long)v1, type); + t2 = kptr_obfuscate((long)v2, type); - return (ret < 0) | ((ret > 0) << 1); + return (t1 < t2) | ((t1 > t2) << 1); } /* The caller must have pinned the task */ From 1fc98d11cac6dd66342e5580cb2687e5b1e9a613 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 9 Sep 2014 14:51:04 -0700 Subject: [PATCH 09/10] fsnotify/fdinfo: use named constants instead of hardcoded values MAX_HANDLE_SZ is equal to 128, but currently the size of pad is only 64 bytes, so exportfs_encode_inode_fh can return an error. Signed-off-by: Andrey Vagin Acked-by: Cyrill Gorcunov Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 238a5930cb3c..660d33bc1bef 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -42,7 +42,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) { struct { struct file_handle handle; - u8 pad[64]; + u8 pad[MAX_HANDLE_SZ]; } f; int size, ret, i; @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f.handle.handle_bytes >> 2; ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); - if ((ret == 255) || (ret == -ENOSPC)) { + if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return 0; } From 7e8824816bda16bb11ff5ff1e1212d642e57b0b3 Mon Sep 17 00:00:00 2001 From: Andrey Vagin Date: Tue, 9 Sep 2014 14:51:06 -0700 Subject: [PATCH 10/10] fs/notify: don't show f_handle if exportfs_encode_inode_fh failed Currently we handle only ENOSPC. In case of other errors the file_handle variable isn't filled properly and we will show a part of stack. Signed-off-by: Andrey Vagin Acked-by: Cyrill Gorcunov Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/notify/fdinfo.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index 660d33bc1bef..9d7e2b9659cb 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -50,7 +50,7 @@ static int show_mark_fhandle(struct seq_file *m, struct inode *inode) size = f.handle.handle_bytes >> 2; ret = exportfs_encode_inode_fh(inode, (struct fid *)f.handle.f_handle, &size, 0); - if ((ret == FILEID_INVALID) || (ret == -ENOSPC)) { + if ((ret == FILEID_INVALID) || (ret < 0)) { WARN_ONCE(1, "Can't encode file handler for inotify: %d\n", ret); return 0; }