From ee190ca6516bc8257e3d36187ca6f0f71a9ec477 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Jan=20H=2E=20Sch=C3=B6nherr?= Date: Wed, 31 Jan 2018 16:14:04 -0800 Subject: [PATCH 001/118] fs/dax.c: release PMD lock even when there is no PMD support in DAX MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit follow_pte_pmd() can theoretically return after having acquired a PMD lock, even when DAX was not compiled with CONFIG_FS_DAX_PMD. Release the PMD lock unconditionally. Link: http://lkml.kernel.org/r/20180118133839.20587-1-jschoenh@amazon.de Fixes: f729c8c9b24f ("dax: wrprotect pmd_t in dax_mapping_entry_mkclean") Signed-off-by: Jan H. Schönherr Reviewed-by: Ross Zwisler Reviewed-by: Andrew Morton Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/dax.c b/fs/dax.c index 95981591977a..c2ebf10b70da 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -636,8 +636,8 @@ static void dax_mapping_entry_mkclean(struct address_space *mapping, pmd = pmd_mkclean(pmd); set_pmd_at(vma->vm_mm, address, pmdp, pmd); unlock_pmd: - spin_unlock(ptl); #endif + spin_unlock(ptl); } else { if (pfn != pte_pfn(*ptep)) goto unlock_pte; From 7e68b36145788e2e52824200edf15a6e59ea8a45 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Wed, 31 Jan 2018 16:14:10 -0800 Subject: [PATCH 002/118] scripts/decodecode: make it take multiline Code line In case of running scripts/decodecode without any parameters in order to give a copy'n'pasted Code line from, for example, email it would parse only first line of it, while in emails it's split to few. ie, when you have a file out of oops the Code line looks like Code: hh hh ... ... hh\n When copy'n'paste from, for example, email where sender or some middle MTA split it, the line looks like: Code: hh hh ... hh\n hh ... ... hh\n hh hh ... hh\n The Code line followed by another oops line usually contains characters out of hex digit + space + < + > set. So add logic to join this split back if and only if the following lines have hex digits, or spaces, or '<', or '>' characters. It will be quite unlikely to have a broken input in well formed Oops or dmesg, thus a simple regex is being used. 
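The join heuristic is small enough to restate outside the script: a following line is treated as a continuation of a wrapped "Code:" dump only if every character in it is a hex digit, whitespace, '<' or '>'. The sketch below expresses that same test in C purely as an illustration (the script itself is shell, and the helper name here is ours, not part of the patch):

	#include <ctype.h>
	#include <stdio.h>

	/* Same check as grep '^[[:xdigit:]<>[:space:]]\+$' in decodecode:
	 * accept a line as a continuation of a split "Code:" dump only if
	 * it contains nothing but hex digits, whitespace, '<' and '>'.
	 */
	static int is_code_continuation(const char *line)
	{
		if (*line == '\0')
			return 0;
		for (; *line; line++) {
			if (!isxdigit((unsigned char)*line) &&
			    !isspace((unsigned char)*line) &&
			    *line != '<' && *line != '>')
				return 0;
		}
		return 1;
	}

	int main(void)
	{
		printf("%d\n", is_code_continuation("8b 45 f0 <48> 8b 00"));	/* 1: joined */
		printf("%d\n", is_code_continuation("RIP: 0010:foo+0x17/0x30"));	/* 0: stop joining */
		return 0;
	}
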
Link: http://lkml.kernel.org/r/20171212100323.33201-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Cc: Will Deacon Cc: Dave Martin Cc: Philippe Ombredanne Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/decodecode | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/scripts/decodecode b/scripts/decodecode index 5ea071099330..9cef558528aa 100755 --- a/scripts/decodecode +++ b/scripts/decodecode @@ -21,12 +21,24 @@ trap cleanup EXIT T=`mktemp` || die "cannot create temp file" code= +cont= while read i ; do case "$i" in *Code:*) code=$i + cont=yes + ;; +*) + [ -n "$cont" ] && { + xdump="$(echo $i | grep '^[[:xdigit:]<>[:space:]]\+$')" + if [ -n "$xdump" ]; then + code="$code $xdump" + else + cont= + fi + } ;; esac From 99443f811c452c693d2f539debb7aea203ed5091 Mon Sep 17 00:00:00 2001 From: Arend van Spriel Date: Wed, 31 Jan 2018 16:14:14 -0800 Subject: [PATCH 003/118] scripts/tags.sh: change find_other_sources() for include directories The current find done in find_other_sources() excludes directories in the kernel tree that are named 'include', eg.: ./security/apparmor/include ./security/selinux/include ./drivers/net/wireless/broadcom/brcm80211/include ./drivers/gpu/drm/amd/acp/include ./drivers/gpu/drm/amd/display/include ./drivers/gpu/drm/amd/include ./drivers/gpu/drm/nouveau/include This changes the find command in find_other_sources() to include those using the -path option. Link: http://lkml.kernel.org/r/1513335768-7852-1-git-send-email-arend.vanspriel@broadcom.com Signed-off-by: Arend van Spriel Cc: Robert Jarzmik Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/tags.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/scripts/tags.sh b/scripts/tags.sh index d23dcbf17457..78e546ff689c 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -77,7 +77,7 @@ find_include_sources() find_other_sources() { find ${tree}* $ignore \ - \( -name include -o -name arch -o -name '.tmp_*' \) -prune -o \ + \( -path ${tree}include -o -path ${tree}arch -o -name '.tmp_*' \) -prune -o \ -name "$1" -not -type l -print; } From d91dad45ba90859385767f91de2197752e52431d Mon Sep 17 00:00:00 2001 From: Sudip Mukherjee Date: Wed, 31 Jan 2018 16:14:17 -0800 Subject: [PATCH 004/118] m32r: remove abort() Commit 7c2c11b208be ("arch: define weak abort()") has introduced a weak abort() which is common for all arch. And, so we will not need arch specific abort which has the same code as the weak abort(). Remove the abort() for m32r. Link: http://lkml.kernel.org/r/1516912339-5665-1-git-send-email-sudipm.mukherjee@gmail.com Signed-off-by: Sudip Mukherjee Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/m32r/kernel/traps.c | 8 -------- 1 file changed, 8 deletions(-) diff --git a/arch/m32r/kernel/traps.c b/arch/m32r/kernel/traps.c index b88a8dd14933..a6f300a208bd 100644 --- a/arch/m32r/kernel/traps.c +++ b/arch/m32r/kernel/traps.c @@ -115,14 +115,6 @@ static void set_eit_vector_entries(void) _flush_cache_copyback_all(); } -void abort(void) -{ - BUG(); - - /* if that doesn't kill us, halt */ - panic("Oops failed to kill thread"); -} - void __init trap_init(void) { set_eit_vector_entries(); From e37b963cfc37f92da1fa4eb53a88f33a5d8cd664 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 31 Jan 2018 16:14:21 -0800 Subject: [PATCH 005/118] fs/ocfs2/dlm/dlmmaster.c: clean up dead code This code has been commented out for 12 years. Remove it. 
Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373CED7EF9E@H3CMLB14-EX.srv.huawei-3com.com Signed-off-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlm/dlmmaster.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index 9c3e0f13ca87..a7df226f9449 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1122,13 +1122,6 @@ recheck: /* sleep if we haven't finished voting yet */ if (sleep) { unsigned long timeo = msecs_to_jiffies(DLM_MASTERY_TIMEOUT_MS); - - /* - if (kref_read(&mle->mle_refs) < 2) - mlog(ML_ERROR, "mle (%p) refs=%d, name=%.*s\n", mle, - kref_read(&mle->mle_refs), - res->lockname.len, res->lockname.name); - */ atomic_set(&mle->woken, 0); (void)wait_event_timeout(mle->wq, (atomic_read(&mle->woken) == 1), From cfdce25cb9e47f7ddd817a9070f675d425d3fc4a Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 31 Jan 2018 16:14:25 -0800 Subject: [PATCH 006/118] ocfs2/cluster: neaten a member of o2net_msg_handler It's odd that o2net_msg_handler::nh_func_data is declared as type o2net_msg_handler_func*. So neaten it. Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F1F554DA@H3CMLB14-EX.srv.huawei-3com.com Signed-off-by: Changwei Ge Reviewed-by: Joseph Qi Reviewed-by: Alex Chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/tcp_internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/cluster/tcp_internal.h b/fs/ocfs2/cluster/tcp_internal.h index b95e7df5b76a..0276f7f8d5e6 100644 --- a/fs/ocfs2/cluster/tcp_internal.h +++ b/fs/ocfs2/cluster/tcp_internal.h @@ -196,7 +196,7 @@ struct o2net_msg_handler { u32 nh_msg_type; u32 nh_key; o2net_msg_handler_func *nh_func; - o2net_msg_handler_func *nh_func_data; + void *nh_func_data; o2net_post_msg_handler_func *nh_post_func; struct kref nh_kref; From a52370b3b182f792b4de4fc4c611b829aec11953 Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 31 Jan 2018 16:14:29 -0800 Subject: [PATCH 007/118] ocfs2: give an obvious tip for mismatched cluster names Add an obvious error message, due to mismatched cluster names between on-disk and in the current cluster. We can meet this case during OCFS2 cluster migration. If we can give the user an obvious tip for why they can not mount the file system after migration, they can quickly fix this mismatch problem. Second, also move printing ocfs2_fill_super() errno to the front of ocfs2_dismount_volume(), since ocfs2_dismount_volume() will also print its own message. I looked through all the code of OCFS2 (include o2cb); there is not any place which returns this error. In fact, the function calling path ocfs2_fill_super -> ocfs2_mount_volume -> ocfs2_dlm_init -> dlm_new_lockspace is a very specific one. We can use this errno to give the user a more clear tip, since this case is a little common during cluster migration, but the customer can quickly get the failure cause if there is a error printed. Also, I think it is not possible to add this errno in the o2cb path during ocfs2_dlm_init(), since the o2cb code has been stable for a long time. We only print this error tip when the user uses pcmk stack, since using the o2cb stack the user will not meet this error. 
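Condensed, the behavioural part of the change in ocfs2_mount_volume() (the full hunk is in the diff below) is just an extra check on the errno coming back through ocfs2_dlm_init():

	status = ocfs2_dlm_init(osb);
	if (status < 0) {
		mlog_errno(status);
		/* -EBADR only reaches here via dlm_new_lockspace() on the
		 * userspace (pcmk) stack, where it indicates that the
		 * cluster name recorded on disk differs from the name of
		 * the running cluster -- so say that explicitly.
		 */
		if (status == -EBADR && ocfs2_userspace_stack(osb))
			mlog(ML_ERROR, "couldn't mount because cluster name on"
			     " disk does not match the running cluster name.\n");
		goto leave;
	}
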
[ghe@suse.com: v2] Link: http://lkml.kernel.org/r/1495419305-3780-1-git-send-email-ghe@suse.com Link: http://lkml.kernel.org/r/1495089336-19312-1-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Mark Fasheh Acked-by: Joseph Qi Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 80efa5699fb0..350066e9d60b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1208,14 +1208,15 @@ static int ocfs2_fill_super(struct super_block *sb, void *data, int silent) read_super_error: brelse(bh); + if (status) + mlog_errno(status); + if (osb) { atomic_set(&osb->vol_state, VOLUME_DISABLED); wake_up(&osb->osb_mount_event); ocfs2_dismount_volume(sb, 1); } - if (status) - mlog_errno(status); return status; } @@ -1843,6 +1844,9 @@ static int ocfs2_mount_volume(struct super_block *sb) status = ocfs2_dlm_init(osb); if (status < 0) { mlog_errno(status); + if (status == -EBADR && ocfs2_userspace_stack(osb)) + mlog(ML_ERROR, "couldn't mount because cluster name on" + " disk does not match the running cluster name.\n"); goto leave; } From fc2af28bd91561a30067c26a85776d1e457b4ad7 Mon Sep 17 00:00:00 2001 From: Yang Zhang Date: Wed, 31 Jan 2018 16:14:33 -0800 Subject: [PATCH 008/118] ocfs2/cluster: close a race that fence can't be triggered When some nodes of cluster face with TCP connection fault, ocfs2 will pick up a quorum to continue to work and other nodes will be fenced by resetting host. In order to decide which node should be fenced, ocfs2 leverages o2quo_state::qs_holds. If that variable is reduced to zero, then a try to decide if fence local node is performed. However, under a specific scenario that local node is not disconnected from others at the same time, above method has a problem to reduce ::qs_holds to zero. Because, o2net 90s idle timer corresponding to different nodes is triggered one after another. node 2 node 3 90s idle timer elapses clear ::qs_conn_bm set hold 40s is passed 90 idle timer elapses clear ::qs_conn_bm set hold still up timer elapses clear hold (NOT to zero ) 90s idle timer elapses AGAIN still up timer elapses. clear hold still up timer elapses To solve this issue, a node which has already be evicted from ::qs_conn_bm can't set hold again and again invoked from idle timer. 
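In code terms, the fix (shown in the quorum.c hunk below) makes the hold conditional on this call actually evicting the node from ::qs_conn_bm, so repeated idle-timer invocations for an already disconnected node no longer pile up holds. A condensed view of the resulting o2quo_conn_err() flow, with logging and the surrounding bookkeeping elided:

	/* runs under qs->qs_lock */
	if (test_bit(node, qs->qs_conn_bm)) {
		/* first disconnect event seen for this node */
		clear_bit(node, qs->qs_conn_bm);
		/* take the hold only here, while the node is being evicted */
		if (test_bit(node, qs->qs_hb_bm))
			o2quo_set_hold(qs, node);
	}
	/* Before the fix the test_bit()/o2quo_set_hold() pair sat below,
	 * outside the check, so every subsequent 90s idle-timer callback
	 * for the same (already evicted) node added another hold and
	 * ::qs_holds could never drain to zero to trigger the fence
	 * decision.
	 */
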
Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F1F3F93B@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: Yang Zhang Signed-off-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/cluster/quorum.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/cluster/quorum.c b/fs/ocfs2/cluster/quorum.c index 62e8ec619b4c..af2e7473956e 100644 --- a/fs/ocfs2/cluster/quorum.c +++ b/fs/ocfs2/cluster/quorum.c @@ -314,12 +314,13 @@ void o2quo_conn_err(u8 node) node, qs->qs_connected); clear_bit(node, qs->qs_conn_bm); + + if (test_bit(node, qs->qs_hb_bm)) + o2quo_set_hold(qs, node); } mlog(0, "node %u, %d total\n", node, qs->qs_connected); - if (test_bit(node, qs->qs_hb_bm)) - o2quo_set_hold(qs, node); spin_unlock(&qs->qs_lock); } From 32ed0bd743367f40ff1d15da2e81e1eca63b7edc Mon Sep 17 00:00:00 2001 From: alex chen Date: Wed, 31 Jan 2018 16:14:36 -0800 Subject: [PATCH 009/118] ocfs2: use the OCFS2_XATTR_ROOT_SIZE macro in ocfs2_reflink_xattr_header() Use the OCFS2_XATTR_ROOT_SIZE macro improves the readability of the code. Link: http://lkml.kernel.org/r/5A2E2488.70301@huawei.com Signed-off-by: Alex Chen Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index c5898c59d411..2423e905ec1a 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6415,7 +6415,7 @@ static int ocfs2_reflink_xattr_header(handle_t *handle, * and then insert the extents one by one. */ if (xv->xr_list.l_tree_depth) { - memcpy(new_xv, &def_xv, sizeof(def_xv)); + memcpy(new_xv, &def_xv, OCFS2_XATTR_ROOT_SIZE); vb->vb_xv = new_xv; vb->vb_bh = value_bh; ocfs2_init_xattr_value_extent_tree(&data_et, From dd7b5f9d01ff4af8c6a55cf029032868bc12a474 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 31 Jan 2018 16:14:40 -0800 Subject: [PATCH 010/118] ocfs2: clean dead code in suballoc.c Stack variable fe is no longer used, so trim it to save some CPU cycles and stack space. Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F1F5A8DD@H3CMLB14-EX.srv.huawei-3com.com Signed-off-by: Changwei Ge Reviewed-by: Joseph Qi Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 9f0b95abc09f..2d8d31c85f45 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2563,16 +2563,16 @@ static int _ocfs2_free_clusters(handle_t *handle, int status; u16 bg_start_bit; u64 bg_blkno; - struct ocfs2_dinode *fe; /* You can't ever have a contiguous set of clusters * bigger than a block group bitmap so we never have to worry * about looping on them. * This is expensive. We can safely remove once this stuff has * gotten tested really well. 
*/ - BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, ocfs2_blocks_to_clusters(bitmap_inode->i_sb, start_blk))); + BUG_ON(start_blk != ocfs2_clusters_to_blocks(bitmap_inode->i_sb, + ocfs2_blocks_to_clusters(bitmap_inode->i_sb, + start_blk))); - fe = (struct ocfs2_dinode *) bitmap_bh->b_data; ocfs2_block_to_cluster_group(bitmap_inode, start_blk, &bg_blkno, &bg_start_bit); From 025bcbde3634b2c9b316f227fed13ad6ad6817fb Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 31 Jan 2018 16:14:44 -0800 Subject: [PATCH 011/118] ocfs2: return -EROFS to mount.ocfs2 if inode block is invalid If metadata is corrupted such as 'invalid inode block', we will get failed by calling 'mount()' and then set filesystem readonly as below: ocfs2_mount ocfs2_initialize_super ocfs2_init_global_system_inodes ocfs2_iget ocfs2_read_locked_inode ocfs2_validate_inode_block ocfs2_error ocfs2_handle_error ocfs2_set_ro_flag(osb, 0); // set readonly In this situation we need return -EROFS to 'mount.ocfs2', so that user can fix it by fsck. And then mount again. In addition, 'mount.ocfs2' should be updated correspondingly as it only return 1 for all errno. And I will post a patch for 'mount.ocfs2' too. Link: http://lkml.kernel.org/r/5A4302FA.2010606@huawei.com Signed-off-by: Jun Piao Reviewed-by: Alex Chen Reviewed-by: Joseph Qi Reviewed-by: Changwei Ge Reviewed-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/super.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 350066e9d60b..ffa4952d432b 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -474,9 +474,8 @@ static int ocfs2_init_global_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog_errno(status); - /* FIXME: Should ERROR_RO_FS */ mlog(ML_ERROR, "Unable to load system inode %d, " "possibly corrupt fs?", i); goto bail; @@ -505,7 +504,7 @@ static int ocfs2_init_local_system_inodes(struct ocfs2_super *osb) new = ocfs2_get_system_file_inode(osb, i, osb->slot_num); if (!new) { ocfs2_release_system_inodes(osb); - status = -EINVAL; + status = ocfs2_is_soft_readonly(osb) ? -EROFS : -EINVAL; mlog(ML_ERROR, "status=%d, sysfile=%d, slot=%d\n", status, i, osb->slot_num); goto bail; From ff26cc10aec128c3f86b5611fd5f59c71d49c0e3 Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 31 Jan 2018 16:14:48 -0800 Subject: [PATCH 012/118] ocfs2: try a blocking lock before return AOP_TRUNCATED_PAGE If we can't get inode lock immediately in the function ocfs2_inode_lock_with_page() when reading a page, we should not return directly here, since this will lead to a softlockup problem when the kernel is configured with CONFIG_PREEMPT is not set. The method is to get a blocking lock and immediately unlock before returning, this can avoid CPU resource waste due to lots of retries, and benefits fairness in getting lock among multiple nodes, increase efficiency in case modifying the same file frequently from multiple nodes. 
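In code, that method is a small pattern wrapped around the nonblocking attempt (the full dlmglue.c hunk appears at the end of this message):

	ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK);
	if (ret == -EAGAIN) {
		unlock_page(page);
		/* Block once on the DLM lock so this node waits its turn
		 * instead of spinning on NONBLOCK attempts, then drop it
		 * immediately; the fault path retries the read via
		 * AOP_TRUNCATED_PAGE.
		 */
		if (ocfs2_inode_lock(inode, ret_bh, ex) == 0)
			ocfs2_inode_unlock(inode, ex);
		ret = AOP_TRUNCATED_PAGE;
	}
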
The softlockup crash (when set /proc/sys/kernel/softlockup_panic to 1) looks like: Kernel panic - not syncing: softlockup: hung tasks CPU: 0 PID: 885 Comm: multi_mmap Tainted: G L 4.12.14-6.1-default #1 Hardware name: Bochs Bochs, BIOS Bochs 01/01/2011 Call Trace: dump_stack+0x5c/0x82 panic+0xd5/0x21e watchdog_timer_fn+0x208/0x210 __hrtimer_run_queues+0xcc/0x200 hrtimer_interrupt+0xa6/0x1f0 smp_apic_timer_interrupt+0x34/0x50 apic_timer_interrupt+0x96/0xa0 RIP: 0010:unlock_page+0x17/0x30 RSP: 0000:ffffaf154080bc88 EFLAGS: 00000246 ORIG_RAX: ffffffffffffff10 RAX: dead000000000100 RBX: fffff21e009f5300 RCX: 0000000000000004 RDX: dead0000000000ff RSI: 0000000000000202 RDI: fffff21e009f5300 RBP: 0000000000000000 R08: 0000000000000000 R09: ffffaf154080bb00 R10: ffffaf154080bc30 R11: 0000000000000040 R12: ffff993749a39518 R13: 0000000000000000 R14: fffff21e009f5300 R15: fffff21e009f5300 ocfs2_inode_lock_with_page+0x25/0x30 [ocfs2] ocfs2_readpage+0x41/0x2d0 [ocfs2] filemap_fault+0x12b/0x5c0 ocfs2_fault+0x29/0xb0 [ocfs2] __do_fault+0x1a/0xa0 __handle_mm_fault+0xbe8/0x1090 handle_mm_fault+0xaa/0x1f0 __do_page_fault+0x235/0x4b0 trace_do_page_fault+0x3c/0x110 async_page_fault+0x28/0x30 RIP: 0033:0x7fa75ded638e RSP: 002b:00007ffd6657db18 EFLAGS: 00010287 RAX: 000055c7662fb700 RBX: 0000000000000001 RCX: 000055c7662fb700 RDX: 0000000000001770 RSI: 00007fa75e909000 RDI: 000055c7662fb700 RBP: 0000000000000003 R08: 000000000000000e R09: 0000000000000000 R10: 0000000000000483 R11: 00007fa75ded61b0 R12: 00007fa75e90a770 R13: 000000000000000e R14: 0000000000001770 R15: 0000000000000000 About performance improvement, we can see the testing time is reduced, and CPU utilization decreases, the detailed data is as follows. I ran multi_mmap test case in ocfs2-test package in a three nodes cluster. Before applying this patch: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2754 ocfs2te+ 20 0 170248 6980 4856 D 80.73 0.341 0:18.71 multi_mmap 1505 root rt 0 222236 123060 97224 S 2.658 6.015 0:01.44 corosync 5 root 20 0 0 0 0 S 1.329 0.000 0:00.19 kworker/u8:0 95 root 20 0 0 0 0 S 1.329 0.000 0:00.25 kworker/u8:1 2728 root 20 0 0 0 0 S 0.997 0.000 0:00.24 jbd2/sda1-33 2721 root 20 0 0 0 0 S 0.664 0.000 0:00.07 ocfs2dc-3C8CFD4 2750 ocfs2te+ 20 0 142976 4652 3532 S 0.664 0.227 0:00.28 mpirun ocfs2test@tb-node2:~>multiple_run.sh -i ens3 -k ~/linux-4.4.21-69.tar.gz -o ~/ocfs2mullog -C hacluster -s pcmk -n tb-node2,tb-node1,tb-node3 -d /dev/sda1 -b 4096 -c 32768 -t multi_mmap /mnt/shared Tests with "-b 4096 -C 32768" Thu Dec 28 14:44:52 CST 2017 multi_mmap..................................................Passed. Runtime 783 seconds. 
After apply this patch: PID USER PR NI VIRT RES SHR S %CPU %MEM TIME+ COMMAND 2508 ocfs2te+ 20 0 170248 6804 4680 R 54.00 0.333 0:55.37 multi_mmap 155 root 20 0 0 0 0 S 2.667 0.000 0:01.20 kworker/u8:3 95 root 20 0 0 0 0 S 2.000 0.000 0:01.58 kworker/u8:1 2504 ocfs2te+ 20 0 142976 4604 3480 R 1.667 0.225 0:01.65 mpirun 5 root 20 0 0 0 0 S 1.000 0.000 0:01.36 kworker/u8:0 2482 root 20 0 0 0 0 S 1.000 0.000 0:00.86 jbd2/sda1-33 299 root 0 -20 0 0 0 S 0.333 0.000 0:00.13 kworker/2:1H 335 root 0 -20 0 0 0 S 0.333 0.000 0:00.17 kworker/1:1H 535 root 20 0 12140 7268 1456 S 0.333 0.355 0:00.34 haveged 1282 root rt 0 222284 123108 97224 S 0.333 6.017 0:01.33 corosync ocfs2test@tb-node2:~>multiple_run.sh -i ens3 -k ~/linux-4.4.21-69.tar.gz -o ~/ocfs2mullog -C hacluster -s pcmk -n tb-node2,tb-node1,tb-node3 -d /dev/sda1 -b 4096 -c 32768 -t multi_mmap /mnt/shared Tests with "-b 4096 -C 32768" Thu Dec 28 15:04:12 CST 2017 multi_mmap..................................................Passed. Runtime 487 seconds. Link: http://lkml.kernel.org/r/1514447305-30814-1-git-send-email-ghe@suse.com Fixes: 1cce4df04f37 ("ocfs2: do not lock/unlock() inode DLM lock") Signed-off-by: Gang He Reviewed-by: Eric Ren Acked-by: alex chen Acked-by: piaojun Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 4689940a953c..5193218f5889 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2486,6 +2486,15 @@ int ocfs2_inode_lock_with_page(struct inode *inode, ret = ocfs2_inode_lock_full(inode, ret_bh, ex, OCFS2_LOCK_NONBLOCK); if (ret == -EAGAIN) { unlock_page(page); + /* + * If we can't get inode lock immediately, we should not return + * directly here, since this will lead to a softlockup problem. + * The method is to get a blocking lock and immediately unlock + * before returning, this can avoid CPU resource waste due to + * lots of retries, and benefits fairness in getting lock. + */ + if (ocfs2_inode_lock(inode, ret_bh, ex) == 0) + ocfs2_inode_unlock(inode, ex); ret = AOP_TRUNCATED_PAGE; } From c0a1a6d769aedd23c80fc12721cc98b126bec91f Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 31 Jan 2018 16:14:51 -0800 Subject: [PATCH 013/118] ocfs2/xattr: assign errno to 'ret' in ocfs2_calc_xattr_init() We need catch the errno returned by ocfs2_xattr_get_nolock() and assign it to 'ret' for printing and noticing upper callers. Link: http://lkml.kernel.org/r/5A571CAF.8050709@huawei.com Signed-off-by: Jun Piao Reviewed-by: Alex Chen Reviewed-by: Yiwen Jiang Acked-by: Gang He Acked-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/xattr.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 2423e905ec1a..268619c96b4e 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -646,6 +646,7 @@ int ocfs2_calc_xattr_init(struct inode *dir, if (S_ISDIR(mode)) a_size <<= 1; } else if (acl_len != 0 && acl_len != -ENODATA) { + ret = acl_len; mlog_errno(ret); return ret; } From d22aa61549036c1dfb47c04c0766630b128efe40 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 31 Jan 2018 16:14:55 -0800 Subject: [PATCH 014/118] ocfs2: clean up dead code in alloc.c Some stack variables are no longer used but still assigned. Trim them. 
Link: http://lkml.kernel.org/r/1516105069-12643-1-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Reviewed-by: Jun Piao Reviewed-by: Alex Chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 11 ++--------- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index ab5105f9767e..134f3b5c25b0 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -2598,11 +2598,8 @@ static void ocfs2_unlink_subtree(handle_t *handle, int i; struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; - struct ocfs2_extent_list *el; struct ocfs2_extent_block *eb; - el = path_leaf_el(left_path); - eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) @@ -3938,7 +3935,7 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, struct ocfs2_path *path, struct ocfs2_extent_rec *insert_rec) { - int ret, i, next_free; + int i, next_free; struct buffer_head *bh; struct ocfs2_extent_list *el; struct ocfs2_extent_rec *rec; @@ -3955,7 +3952,6 @@ static void ocfs2_adjust_rightmost_records(handle_t *handle, ocfs2_error(ocfs2_metadata_cache_get_super(et->et_ci), "Owner %llu has a bad extent list\n", (unsigned long long)ocfs2_metadata_cache_owner(et->et_ci)); - ret = -EIO; return; } @@ -5057,7 +5053,6 @@ int ocfs2_split_extent(handle_t *handle, struct buffer_head *last_eb_bh = NULL; struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; struct ocfs2_merge_ctxt ctxt; - struct ocfs2_extent_list *rightmost_el; if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < @@ -5093,9 +5088,7 @@ int ocfs2_split_extent(handle_t *handle, } eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; - rightmost_el = &eb->h_list; - } else - rightmost_el = path_root_el(path); + } if (rec->e_cpos == split_rec->e_cpos && rec->e_leaf_clusters == split_rec->e_leaf_clusters) From 16c8d569f5704a84164f30ff01b29879f3438065 Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 31 Jan 2018 16:14:59 -0800 Subject: [PATCH 015/118] ocfs2/acl: use 'ip_xattr_sem' to protect getting extended attribute The race between *set_acl and *get_acl will cause getting incomplete xattr data as below: processA processB ocfs2_set_acl ocfs2_xattr_set __ocfs2_xattr_set_handle ocfs2_get_acl_nolock ocfs2_xattr_get_nolock: processB may get incomplete xattr data if processA hasn't set_acl done. So we should use 'ip_xattr_sem' to protect getting extended attribute in ocfs2_get_acl_nolock(), as other processes could be changing it concurrently. 
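This is the usual rw_semaphore pattern: the xattr set path is expected to hold ip_xattr_sem for writing while it rewrites the ACL xattr (that is what the race diagram above relies on), so each ACL read path only has to bracket ocfs2_get_acl_nolock() with the read side, as the diff below does:

	/* writer side (xattr set path, assumed already in place):
	 *	down_write(&OCFS2_I(inode)->ip_xattr_sem);
	 *	... __ocfs2_xattr_set_handle() updates the ACL xattr ...
	 *	up_write(&OCFS2_I(inode)->ip_xattr_sem);
	 */

	/* reader side, added around every ocfs2_get_acl_nolock() call so a
	 * concurrent reader can no longer observe a half-updated ACL:
	 */
	down_read(&OCFS2_I(inode)->ip_xattr_sem);
	acl = ocfs2_get_acl_nolock(inode, type, di_bh);
	up_read(&OCFS2_I(inode)->ip_xattr_sem);
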
Link: http://lkml.kernel.org/r/5A5DDCFF.7030001@huawei.com Signed-off-by: Jun Piao Reviewed-by: Alex Chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/acl.c | 6 ++++++ fs/ocfs2/xattr.c | 2 ++ 2 files changed, 8 insertions(+) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 40b5cc97f7b0..917fadca8a7b 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -311,7 +311,9 @@ struct posix_acl *ocfs2_iop_get_acl(struct inode *inode, int type) if (had_lock < 0) return ERR_PTR(had_lock); + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, type, di_bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); ocfs2_inode_unlock_tracker(inode, 0, &oh, had_lock); brelse(di_bh); @@ -330,7 +332,9 @@ int ocfs2_acl_chmod(struct inode *inode, struct buffer_head *bh) if (!(osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL)) return 0; + down_read(&OCFS2_I(inode)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(inode, ACL_TYPE_ACCESS, bh); + up_read(&OCFS2_I(inode)->ip_xattr_sem); if (IS_ERR(acl) || !acl) return PTR_ERR(acl); ret = __posix_acl_chmod(&acl, GFP_KERNEL, inode->i_mode); @@ -361,8 +365,10 @@ int ocfs2_init_acl(handle_t *handle, if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl = ocfs2_get_acl_nolock(dir, ACL_TYPE_DEFAULT, dir_bh); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (IS_ERR(acl)) return PTR_ERR(acl); } diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 268619c96b4e..c261c1dfd374 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -638,9 +638,11 @@ int ocfs2_calc_xattr_init(struct inode *dir, si->value_len); if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { + down_read(&OCFS2_I(dir)->ip_xattr_sem); acl_len = ocfs2_xattr_get_nolock(dir, dir_bh, OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT, "", NULL, 0); + up_read(&OCFS2_I(dir)->ip_xattr_sem); if (acl_len > 0) { a_size = ocfs2_xattr_entry_real_size(0, acl_len); if (S_ISDIR(mode)) From 63de8bd9328bf2a778fc277503da163ae3defa3c Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 31 Jan 2018 16:15:02 -0800 Subject: [PATCH 016/118] ocfs2: make metadata estimation accurate and clear Current code assume that ::w_unwritten_list always has only one item on. This is not right and hard to get understood. So improve how to count unwritten item. 
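Concretely, the write context now counts every unwritten extent it queues instead of assuming the list holds a single entry, and the DIO completion bookkeeping consumes that count; the two hunks below condense to:

	/* each time an unwritten extent is queued on the write context */
	list_add_tail(&new->ue_node, &wc->w_unwritten_list);
	wc->w_unwritten_count++;		/* new: one per queued extent */

	/* when the list is handed over to the DIO write context */
	list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list);
	dwc->dw_zero_count += wc->w_unwritten_count;	/* was: dw_zero_count++ */
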
Link: http://lkml.kernel.org/r/1515479070-32653-1-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Reported-by: John Lightsey Tested-by: John Lightsey Cc: Mark Fasheh Cc: Joseph Qi Cc: Junxiao Bi Cc: Joel Becker Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/aops.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index d1516327b787..256986aca8df 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -797,6 +797,7 @@ struct ocfs2_write_ctxt { struct ocfs2_cached_dealloc_ctxt w_dealloc; struct list_head w_unwritten_list; + unsigned int w_unwritten_count; }; void ocfs2_unlock_and_free_pages(struct page **pages, int num_pages) @@ -1386,6 +1387,7 @@ retry: desc->c_clear_unwritten = 0; list_add_tail(&new->ue_ip_node, &oi->ip_unwritten_list); list_add_tail(&new->ue_node, &wc->w_unwritten_list); + wc->w_unwritten_count++; new = NULL; unlock: spin_unlock(&oi->ip_lock); @@ -2256,7 +2258,7 @@ static int ocfs2_dio_wr_get_block(struct inode *inode, sector_t iblock, ue->ue_phys = desc->c_phys; list_splice_tail_init(&wc->w_unwritten_list, &dwc->dw_zero_list); - dwc->dw_zero_count++; + dwc->dw_zero_count += wc->w_unwritten_count; } ret = ocfs2_write_end_nolock(inode->i_mapping, pos, len, len, wc); From 71a36944042b7d9dd71f6a5d1c5ea1c2353b5d42 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 31 Jan 2018 16:15:06 -0800 Subject: [PATCH 017/118] ocfs2: try to reuse extent block in dealloc without meta_alloc A crash issue was reported by John Lightsey with a call trace as follows: ocfs2_split_extent+0x1ad3/0x1b40 [ocfs2] ocfs2_change_extent_flag+0x33a/0x470 [ocfs2] ocfs2_mark_extent_written+0x172/0x220 [ocfs2] ocfs2_dio_end_io+0x62d/0x910 [ocfs2] dio_complete+0x19a/0x1a0 do_blockdev_direct_IO+0x19dd/0x1eb0 __blockdev_direct_IO+0x43/0x50 ocfs2_direct_IO+0x8f/0xa0 [ocfs2] generic_file_direct_write+0xb2/0x170 __generic_file_write_iter+0xc3/0x1b0 ocfs2_file_write_iter+0x4bb/0xca0 [ocfs2] __vfs_write+0xae/0xf0 vfs_write+0xb8/0x1b0 SyS_write+0x4f/0xb0 system_call_fastpath+0x16/0x75 The BUG code told that extent tree wants to grow but no metadata was reserved ahead of time. From my investigation into this issue, the root cause it that although enough metadata is not reserved, there should be enough for following use. Rightmost extent is merged into its left one due to a certain times of marking extent written. Because during marking extent written, we got many physically continuous extents. At last, an empty extent showed up and the rightmost path is removed from extent tree. Add a new mechanism to reuse extent block cached in dealloc which were just unlinked from extent tree to solve this crash issue. Criteria is that during marking extents *written*, if extent rotation and merging results in unlinking extent with growing extent tree later without any metadata reserved ahead of time, try to reuse those extents in dealloc in which deleted extents are cached. Also, this patch addresses the issue John reported that ::dw_zero_count is not calculated properly. After applying this patch, the issue John reported was gone. Thanks for the reproducer provided by John. And this patch has passed ocfs2-test(29 cases) suite running by New H3C Group. 
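The mechanism, in outline: the extent tree carries an optional pointer to the dealloc context (::et_dealloc), and the branch-growing paths first try to satisfy their extent-block needs from blocks cached there, falling back to the reserved metadata allocator only for whatever is still missing. A condensed sketch of the ocfs2_add_branch() part of the change (error logging trimmed; the full diff follows):

	int block_given = 0;

	/* First try extent blocks that rotation/merging just unlinked and
	 * cached in the dealloc context attached to this extent tree.
	 */
	if (!ocfs2_is_dealloc_empty(et)) {
		status = ocfs2_reuse_blk_from_dealloc(handle, et, new_eb_bhs,
						      new_blocks, &block_given);
		if (status < 0)
			goto bail;
	}

	BUG_ON(block_given > new_blocks);

	/* Whatever dealloc could not provide still comes from the metadata
	 * allocator reserved ahead of time (which must exist in that case).
	 */
	if (block_given < new_blocks) {
		BUG_ON(!meta_ac);
		status = ocfs2_create_new_meta_bhs(handle, et,
						   new_blocks - block_given,
						   meta_ac,
						   &new_eb_bhs[block_given]);
		if (status < 0)
			goto bail;
	}
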
[ge.changwei@h3c.com: fix static checker warnning] Link: http://lkml.kernel.org/r/63ADC13FD55D6546B7DECE290D39E373F29196AE@H3CMLB12-EX.srv.huawei-3com.com [akpm@linux-foundation.org: brelse(NULL) is legal] Link: http://lkml.kernel.org/r/1515479070-32653-2-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Reported-by: John Lightsey Tested-by: John Lightsey Cc: Joel Becker Cc: Joseph Qi Cc: Junxiao Bi Cc: Dan Carpenter Cc: Mark Fasheh Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 206 ++++++++++++++++++++++++++++++++++++++++++++--- fs/ocfs2/alloc.h | 1 + fs/ocfs2/aops.c | 6 ++ 3 files changed, 203 insertions(+), 10 deletions(-) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 134f3b5c25b0..b3321de88d2b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -165,6 +165,13 @@ static int ocfs2_dinode_insert_check(struct ocfs2_extent_tree *et, struct ocfs2_extent_rec *rec); static int ocfs2_dinode_sanity_check(struct ocfs2_extent_tree *et); static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); + +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given); +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et); + static const struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, @@ -448,6 +455,7 @@ static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, if (!obj) obj = (void *)bh->b_data; et->et_object = obj; + et->et_dealloc = NULL; et->et_ops->eo_fill_root_el(et); if (!et->et_ops->eo_fill_max_leaf_clusters) @@ -1158,7 +1166,7 @@ static int ocfs2_add_branch(handle_t *handle, struct buffer_head **last_eb_bh, struct ocfs2_alloc_context *meta_ac) { - int status, new_blocks, i; + int status, new_blocks, i, block_given = 0; u64 next_blkno, new_last_eb_blk; struct buffer_head *bh; struct buffer_head **new_eb_bhs = NULL; @@ -1213,11 +1221,31 @@ static int ocfs2_add_branch(handle_t *handle, goto bail; } - status = ocfs2_create_new_meta_bhs(handle, et, new_blocks, - meta_ac, new_eb_bhs); - if (status < 0) { - mlog_errno(status); - goto bail; + /* Firstyly, try to reuse dealloc since we have already estimated how + * many extent blocks we may use. 
+ */ + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + new_eb_bhs, new_blocks, + &block_given); + if (status < 0) { + mlog_errno(status); + goto bail; + } + } + + BUG_ON(block_given > new_blocks); + + if (block_given < new_blocks) { + BUG_ON(!meta_ac); + status = ocfs2_create_new_meta_bhs(handle, et, + new_blocks - block_given, + meta_ac, + &new_eb_bhs[block_given]); + if (status < 0) { + mlog_errno(status); + goto bail; + } } /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be @@ -1340,15 +1368,25 @@ static int ocfs2_shift_tree_depth(handle_t *handle, struct ocfs2_alloc_context *meta_ac, struct buffer_head **ret_new_eb_bh) { - int status, i; + int status, i, block_given = 0; u32 new_clusters; struct buffer_head *new_eb_bh = NULL; struct ocfs2_extent_block *eb; struct ocfs2_extent_list *root_el; struct ocfs2_extent_list *eb_el; - status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, - &new_eb_bh); + if (!ocfs2_is_dealloc_empty(et)) { + status = ocfs2_reuse_blk_from_dealloc(handle, et, + &new_eb_bh, 1, + &block_given); + } else if (meta_ac) { + status = ocfs2_create_new_meta_bhs(handle, et, 1, meta_ac, + &new_eb_bh); + + } else { + BUG(); + } + if (status < 0) { mlog_errno(status); goto bail; @@ -1511,7 +1549,7 @@ static int ocfs2_grow_tree(handle_t *handle, struct ocfs2_extent_tree *et, int depth = le16_to_cpu(el->l_tree_depth); struct buffer_head *bh = NULL; - BUG_ON(meta_ac == NULL); + BUG_ON(meta_ac == NULL && ocfs2_is_dealloc_empty(et)); shift = ocfs2_find_branch_target(et, &bh); if (shift < 0) { @@ -6578,6 +6616,154 @@ ocfs2_find_per_slot_free_list(int type, return fl; } +static struct ocfs2_per_slot_free_list * +ocfs2_find_preferred_free_list(int type, + int preferred_slot, + int *real_slot, + struct ocfs2_cached_dealloc_ctxt *ctxt) +{ + struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; + + while (fl) { + if (fl->f_inode_type == type && fl->f_slot == preferred_slot) { + *real_slot = fl->f_slot; + return fl; + } + + fl = fl->f_next_suballocator; + } + + /* If we can't find any free list matching preferred slot, just use + * the first one. + */ + fl = ctxt->c_first_suballocator; + *real_slot = fl->f_slot; + + return fl; +} + +/* Return Value 1 indicates empty */ +static int ocfs2_is_dealloc_empty(struct ocfs2_extent_tree *et) +{ + struct ocfs2_per_slot_free_list *fl = NULL; + + if (!et->et_dealloc) + return 1; + + fl = et->et_dealloc->c_first_suballocator; + if (!fl) + return 1; + + if (!fl->f_first) + return 1; + + return 0; +} + +/* If extent was deleted from tree due to extent rotation and merging, and + * no metadata is reserved ahead of time. Try to reuse some extents + * just deleted. This is only used to reuse extent blocks. + * It is supposed to find enough extent blocks in dealloc if our estimation + * on metadata is accurate. + */ +static int ocfs2_reuse_blk_from_dealloc(handle_t *handle, + struct ocfs2_extent_tree *et, + struct buffer_head **new_eb_bh, + int blk_wanted, int *blk_given) +{ + int i, status = 0, real_slot; + struct ocfs2_cached_dealloc_ctxt *dealloc; + struct ocfs2_per_slot_free_list *fl; + struct ocfs2_cached_block_free *bf; + struct ocfs2_extent_block *eb; + struct ocfs2_super *osb = + OCFS2_SB(ocfs2_metadata_cache_get_super(et->et_ci)); + + *blk_given = 0; + + /* If extent tree doesn't have a dealloc, this is not faulty. Just + * tell upper caller dealloc can't provide any block and it should + * ask for alloc to claim more space. 
+ */ + dealloc = et->et_dealloc; + if (!dealloc) + goto bail; + + for (i = 0; i < blk_wanted; i++) { + /* Prefer to use local slot */ + fl = ocfs2_find_preferred_free_list(EXTENT_ALLOC_SYSTEM_INODE, + osb->slot_num, &real_slot, + dealloc); + /* If no more block can be reused, we should claim more + * from alloc. Just return here normally. + */ + if (!fl) { + status = 0; + break; + } + + bf = fl->f_first; + fl->f_first = bf->free_next; + + new_eb_bh[i] = sb_getblk(osb->sb, bf->free_blk); + if (new_eb_bh[i] == NULL) { + status = -ENOMEM; + mlog_errno(status); + goto bail; + } + + mlog(0, "Reusing block(%llu) from " + "dealloc(local slot:%d, real slot:%d)\n", + bf->free_blk, osb->slot_num, real_slot); + + ocfs2_set_new_buffer_uptodate(et->et_ci, new_eb_bh[i]); + + status = ocfs2_journal_access_eb(handle, et->et_ci, + new_eb_bh[i], + OCFS2_JOURNAL_ACCESS_CREATE); + if (status < 0) { + mlog_errno(status); + goto bail; + } + + memset(new_eb_bh[i]->b_data, 0, osb->sb->s_blocksize); + eb = (struct ocfs2_extent_block *) new_eb_bh[i]->b_data; + + /* We can't guarantee that buffer head is still cached, so + * polutlate the extent block again. + */ + strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); + eb->h_blkno = cpu_to_le64(bf->free_blk); + eb->h_fs_generation = cpu_to_le32(osb->fs_generation); + eb->h_suballoc_slot = cpu_to_le16(real_slot); + eb->h_suballoc_loc = cpu_to_le64(bf->free_bg); + eb->h_suballoc_bit = cpu_to_le16(bf->free_bit); + eb->h_list.l_count = + cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); + + /* We'll also be dirtied by the caller, so + * this isn't absolutely necessary. + */ + ocfs2_journal_dirty(handle, new_eb_bh[i]); + + if (!fl->f_first) { + dealloc->c_first_suballocator = fl->f_next_suballocator; + kfree(fl); + } + kfree(bf); + } + + *blk_given = i; + +bail: + if (unlikely(status < 0)) { + for (i = 0; i < blk_wanted; i++) + brelse(new_eb_bh[i]); + } + + return status; +} + int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, int type, int slot, u64 suballoc, u64 blkno, unsigned int bit) diff --git a/fs/ocfs2/alloc.h b/fs/ocfs2/alloc.h index 27b75cf32cfa..250bcacdf9e9 100644 --- a/fs/ocfs2/alloc.h +++ b/fs/ocfs2/alloc.h @@ -61,6 +61,7 @@ struct ocfs2_extent_tree { ocfs2_journal_access_func et_root_journal_access; void *et_object; unsigned int et_max_leaf_clusters; + struct ocfs2_cached_dealloc_ctxt *et_dealloc; }; /* diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 256986aca8df..e8e205bf2e41 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2332,6 +2332,12 @@ static int ocfs2_dio_end_io_write(struct inode *inode, ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(inode), di_bh); + /* Attach dealloc with extent tree in case that we may reuse extents + * which are already unlinked from current extent tree due to extent + * rotation and merging. + */ + et.et_dealloc = &dealloc; + ret = ocfs2_lock_allocators(inode, &et, 0, dwc->dw_zero_count*2, &data_ac, &meta_ac); if (ret) { From 4882abebccb58d68056462b66cea0d7f16169c39 Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 31 Jan 2018 16:15:10 -0800 Subject: [PATCH 018/118] ocfs2: add trimfs dlm lock resource Introduce a new dlm lock resource, which will be used to communicate during fstrimming of an ocfs2 device from cluster nodes. 
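The lock value block (LVB) attached to this lock resource is what carries information between nodes: whichever node performs the trim records the parameters and the result in the LVB while holding the lock EX, and the other nodes read them back when they eventually get the lock. A rough sketch of the intended flow, with error paths elided (the real integration into ocfs2_trim_fs() is the next patch in this series):

	struct ocfs2_trim_fs_info info, *pinfo = NULL;

	ocfs2_trim_fs_lock_res_init(osb);

	if (ocfs2_trim_fs_lock(osb, NULL, 1) == -EAGAIN) {
		/* Another node holds the trimfs lock: block until it is
		 * done, then read its parameters and result back from the
		 * LVB.
		 */
		if (ocfs2_trim_fs_lock(osb, &info, 0) == 0 &&
		    info.tf_valid && info.tf_success) {
			/* same range already trimmed by info.tf_nodenum:
			 * report info.tf_trimlen and return early
			 */
		}
	} else {
		/* We won the EX lock without queueing: do the trim and
		 * publish node number, range, minlen and trimmed length
		 * through the LVB for the nodes waiting behind us.
		 */
		info.tf_nodenum = osb->node_num;
		/* ... perform the trim, fill tf_start/tf_len/tf_minlen,
		 * tf_trimlen and tf_success ...
		 */
		pinfo = &info;
	}

	ocfs2_trim_fs_unlock(osb, pinfo);	/* updates the LVB only if pinfo is set */
	ocfs2_trim_fs_lock_res_uninit(osb);
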
Link: http://lkml.kernel.org/r/1513228484-2084-1-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 86 +++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/dlmglue.h | 29 ++++++++++++++ fs/ocfs2/ocfs2.h | 1 + fs/ocfs2/ocfs2_lockid.h | 5 +++ 4 files changed, 121 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 5193218f5889..f5643e3ff317 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -259,6 +259,10 @@ static struct ocfs2_lock_res_ops ocfs2_nfs_sync_lops = { .flags = 0, }; +static struct ocfs2_lock_res_ops ocfs2_trim_fs_lops = { + .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, +}; + static struct ocfs2_lock_res_ops ocfs2_orphan_scan_lops = { .flags = LOCK_TYPE_REQUIRES_REFRESH|LOCK_TYPE_USES_LVB, }; @@ -676,6 +680,24 @@ static void ocfs2_nfs_sync_lock_res_init(struct ocfs2_lock_res *res, &ocfs2_nfs_sync_lops, osb); } +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_lock_res_init_once(lockres); + ocfs2_build_lock_name(OCFS2_LOCK_TYPE_TRIM_FS, 0, 0, lockres->l_name); + ocfs2_lock_res_init_common(osb, lockres, OCFS2_LOCK_TYPE_TRIM_FS, + &ocfs2_trim_fs_lops, osb); +} + +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb) +{ + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + ocfs2_simple_drop_lockres(osb, lockres); + ocfs2_lock_res_free(lockres); +} + static void ocfs2_orphan_scan_lock_res_init(struct ocfs2_lock_res *res, struct ocfs2_super *osb) { @@ -2754,6 +2776,70 @@ void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex) ex ? LKM_EXMODE : LKM_PRMODE); } +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock) +{ + int status; + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (info) + info->tf_valid = 0; + + if (ocfs2_is_hard_readonly(osb)) + return -EROFS; + + if (ocfs2_mount_local(osb)) + return 0; + + status = ocfs2_cluster_lock(osb, lockres, DLM_LOCK_EX, + trylock ? 
DLM_LKF_NOQUEUE : 0, 0); + if (status < 0) { + if (status != -EAGAIN) + mlog_errno(status); + return status; + } + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + if (ocfs2_dlm_lvb_valid(&lockres->l_lksb) && + lvb->lvb_version == OCFS2_TRIMFS_LVB_VERSION) { + info->tf_valid = 1; + info->tf_success = lvb->lvb_success; + info->tf_nodenum = be32_to_cpu(lvb->lvb_nodenum); + info->tf_start = be64_to_cpu(lvb->lvb_start); + info->tf_len = be64_to_cpu(lvb->lvb_len); + info->tf_minlen = be64_to_cpu(lvb->lvb_minlen); + info->tf_trimlen = be64_to_cpu(lvb->lvb_trimlen); + } + } + + return status; +} + +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info) +{ + struct ocfs2_trim_fs_lvb *lvb; + struct ocfs2_lock_res *lockres = &osb->osb_trim_fs_lockres; + + if (ocfs2_mount_local(osb)) + return; + + if (info) { + lvb = ocfs2_dlm_lvb(&lockres->l_lksb); + lvb->lvb_version = OCFS2_TRIMFS_LVB_VERSION; + lvb->lvb_success = info->tf_success; + lvb->lvb_nodenum = cpu_to_be32(info->tf_nodenum); + lvb->lvb_start = cpu_to_be64(info->tf_start); + lvb->lvb_len = cpu_to_be64(info->tf_len); + lvb->lvb_minlen = cpu_to_be64(info->tf_minlen); + lvb->lvb_trimlen = cpu_to_be64(info->tf_trimlen); + } + + ocfs2_cluster_unlock(osb, lockres, DLM_LOCK_EX); +} + int ocfs2_dentry_lock(struct dentry *dentry, int ex) { int ret; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index a7fc18ba0dc1..2253688b0107 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -70,6 +70,29 @@ struct ocfs2_orphan_scan_lvb { __be32 lvb_os_seqno; }; +#define OCFS2_TRIMFS_LVB_VERSION 1 + +struct ocfs2_trim_fs_lvb { + __u8 lvb_version; + __u8 lvb_success; + __u8 lvb_reserved[2]; + __be32 lvb_nodenum; + __be64 lvb_start; + __be64 lvb_len; + __be64 lvb_minlen; + __be64 lvb_trimlen; +}; + +struct ocfs2_trim_fs_info { + u8 tf_valid; /* lvb is valid, or not */ + u8 tf_success; /* trim is successful, or not */ + u32 tf_nodenum; /* osb node number */ + u64 tf_start; /* trim start offset in clusters */ + u64 tf_len; /* trim end offset in clusters */ + u64 tf_minlen; /* trim minimum contiguous free clusters */ + u64 tf_trimlen; /* trimmed length in bytes */ +}; + struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; @@ -153,6 +176,12 @@ int ocfs2_rename_lock(struct ocfs2_super *osb); void ocfs2_rename_unlock(struct ocfs2_super *osb); int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex); void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex); +void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb); +void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb); +int ocfs2_trim_fs_lock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info, int trylock); +void ocfs2_trim_fs_unlock(struct ocfs2_super *osb, + struct ocfs2_trim_fs_info *info); int ocfs2_dentry_lock(struct dentry *dentry, int ex); void ocfs2_dentry_unlock(struct dentry *dentry, int ex); int ocfs2_file_lock(struct file *file, int ex, int trylock); diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 9a50f222ac97..6867eef2e06b 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -404,6 +404,7 @@ struct ocfs2_super struct ocfs2_lock_res osb_super_lockres; struct ocfs2_lock_res osb_rename_lockres; struct ocfs2_lock_res osb_nfs_sync_lockres; + struct ocfs2_lock_res osb_trim_fs_lockres; struct ocfs2_dlm_debug *osb_dlm_debug; struct dentry *osb_debug_root; diff --git a/fs/ocfs2/ocfs2_lockid.h b/fs/ocfs2/ocfs2_lockid.h index d277aabf5dfb..7051b994c776 100644 --- a/fs/ocfs2/ocfs2_lockid.h +++ 
b/fs/ocfs2/ocfs2_lockid.h @@ -50,6 +50,7 @@ enum ocfs2_lock_type { OCFS2_LOCK_TYPE_NFS_SYNC, OCFS2_LOCK_TYPE_ORPHAN_SCAN, OCFS2_LOCK_TYPE_REFCOUNT, + OCFS2_LOCK_TYPE_TRIM_FS, OCFS2_NUM_LOCK_TYPES }; @@ -93,6 +94,9 @@ static inline char ocfs2_lock_type_char(enum ocfs2_lock_type type) case OCFS2_LOCK_TYPE_REFCOUNT: c = 'T'; break; + case OCFS2_LOCK_TYPE_TRIM_FS: + c = 'I'; + break; default: c = '\0'; } @@ -115,6 +119,7 @@ static char *ocfs2_lock_type_strings[] = { [OCFS2_LOCK_TYPE_NFS_SYNC] = "NFSSync", [OCFS2_LOCK_TYPE_ORPHAN_SCAN] = "OrphanScan", [OCFS2_LOCK_TYPE_REFCOUNT] = "Refcount", + [OCFS2_LOCK_TYPE_TRIM_FS] = "TrimFs", }; static inline const char *ocfs2_lock_type_string(enum ocfs2_lock_type type) From 637dd20c490386c725ab21f3eb763a36fd0a5fb0 Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 31 Jan 2018 16:15:13 -0800 Subject: [PATCH 019/118] ocfs2: add trimfs lock to avoid duplicated trims in cluster ocfs2 supports trimming the underlying disk via the fstrim command. But there is a problem, ocfs2 is a shared disk cluster file system, if the user configures a scheduled fstrim job on each file system node, this will trigger multiple nodes trimming a shared disk simultaneously, which is very wasteful for CPU and IO consumption. This also might negatively affect the lifetime of poor-quality SSD devices. So we introduce a trimfs dlm lock to communicate with each other in this case, which will make only one fstrim command to do the trimming on a shared disk among the cluster. The fstrim commands from the other nodes should wait for the first fstrim to finish and return success directly, to avoid running the same trim on the shared disk again. Link: http://lkml.kernel.org/r/1513228484-2084-2-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/alloc.c | 44 ++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index b3321de88d2b..9a876bb07cac 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -7561,6 +7561,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) struct buffer_head *gd_bh = NULL; struct ocfs2_dinode *main_bm; struct ocfs2_group_desc *gd = NULL; + struct ocfs2_trim_fs_info info, *pinfo = NULL; start = range->start >> osb->s_clustersize_bits; len = range->len >> osb->s_clustersize_bits; @@ -7598,6 +7599,42 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) trace_ocfs2_trim_fs(start, len, minlen); + ocfs2_trim_fs_lock_res_init(osb); + ret = ocfs2_trim_fs_lock(osb, NULL, 1); + if (ret < 0) { + if (ret != -EAGAIN) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + mlog(ML_NOTICE, "Wait for trim on device (%s) to " + "finish, which is running from another node.\n", + osb->dev_str); + ret = ocfs2_trim_fs_lock(osb, &info, 0); + if (ret < 0) { + mlog_errno(ret); + ocfs2_trim_fs_lock_res_uninit(osb); + goto out_unlock; + } + + if (info.tf_valid && info.tf_success && + info.tf_start == start && info.tf_len == len && + info.tf_minlen == minlen) { + /* Avoid sending duplicated trim to a shared device */ + mlog(ML_NOTICE, "The same trim on device (%s) was " + "just done from node (%u), return.\n", + osb->dev_str, info.tf_nodenum); + range->len = info.tf_trimlen; + goto out_trimunlock; + } + } + + info.tf_nodenum = osb->node_num; + info.tf_start = start; + info.tf_len = len; + 
info.tf_minlen = minlen; + /* Determine first and last group to examine based on start and len */ first_group = ocfs2_which_cluster_group(main_bm_inode, start); if (first_group == osb->first_cluster_group_blkno) @@ -7642,6 +7679,13 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) group += ocfs2_clusters_to_blocks(sb, osb->bitmap_cpg); } range->len = trimmed * sb->s_blocksize; + + info.tf_trimlen = range->len; + info.tf_success = (ret ? 0 : 1); + pinfo = &info; +out_trimunlock: + ocfs2_trim_fs_unlock(osb, pinfo); + ocfs2_trim_fs_lock_res_uninit(osb); out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); From 06e7f13d192ba9d6806f6caaf58f88b1b0b57134 Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 31 Jan 2018 16:15:17 -0800 Subject: [PATCH 020/118] ocfs2: add ocfs2_try_rw_lock() and ocfs2_try_inode_lock() Patch series "ocfs2: add nowait aio support", v4. VFS layer has introduced the non-blocking aio flag IOCB_NOWAIT, which tells the kernel to bail out if an AIO request will block for reasons such as file allocations, or writeback triggering, or would block while allocating requests while performing direct I/O. Subsequently, pwritev2/preadv2 also can leverage this part of kernel code. So far, ext4/xfs/btrfs have supported this feature. Add the related code for the ocfs2 file system. This patch (of 3): Add ocfs2_try_rw_lock and ocfs2_try_inode_lock functions, which will be used in non-blocking IO scenarios. [ghe@suse.com: v2] Link: http://lkml.kernel.org/r/1511944612-9629-2-git-send-email-ghe@suse.com Link: http://lkml.kernel.org/r/1511775987-841-2-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Jun Piao Acked-by: alex chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 21 +++++++++++++++++++++ fs/ocfs2/dlmglue.h | 4 ++++ 2 files changed, 25 insertions(+) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index f5643e3ff317..13fa809f4885 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1764,6 +1764,27 @@ int ocfs2_rw_lock(struct inode *inode, int write) return status; } +int ocfs2_try_rw_lock(struct inode *inode, int write) +{ + int status, level; + struct ocfs2_lock_res *lockres; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + + mlog(0, "inode %llu try to take %s RW lock\n", + (unsigned long long)OCFS2_I(inode)->ip_blkno, + write ? "EXMODE" : "PRMODE"); + + if (ocfs2_mount_local(osb)) + return 0; + + lockres = &OCFS2_I(inode)->ip_rw_lockres; + + level = write ? DLM_LOCK_EX : DLM_LOCK_PR; + + status = ocfs2_cluster_lock(osb, lockres, level, DLM_LKF_NOQUEUE, 0); + return status; +} + void ocfs2_rw_unlock(struct inode *inode, int write) { int level = write ? 
DLM_LOCK_EX : DLM_LOCK_PR; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 2253688b0107..34139a3d7118 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -139,6 +139,7 @@ void ocfs2_lock_res_free(struct ocfs2_lock_res *res); int ocfs2_create_new_inode_locks(struct inode *inode); int ocfs2_drop_inode_locks(struct inode *inode); int ocfs2_rw_lock(struct inode *inode, int write); +int ocfs2_try_rw_lock(struct inode *inode, int write); void ocfs2_rw_unlock(struct inode *inode, int write); int ocfs2_open_lock(struct inode *inode); int ocfs2_try_open_lock(struct inode *inode, int write); @@ -163,6 +164,9 @@ int ocfs2_inode_lock_with_page(struct inode *inode, /* 99% of the time we don't want to supply any additional flags -- * those are for very specific cases only. */ #define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL) +#define ocfs2_try_inode_lock(i, b, e)\ + ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\ + OI_LS_NORMAL) void ocfs2_inode_unlock(struct inode *inode, int ex); int ocfs2_super_lock(struct ocfs2_super *osb, From ac604d3cdb20a12d67131d20095c4c7905aeb722 Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 31 Jan 2018 16:15:21 -0800 Subject: [PATCH 021/118] ocfs2: add ocfs2_overwrite_io() Add ocfs2_overwrite_io function, which is used to judge if overwrite allocated blocks, otherwise, the write will bring extra block allocation overhead. [ghe@suse.com: v3] Link: http://lkml.kernel.org/r/1514455665-16325-3-git-send-email-ghe@suse.com [ghe@suse.com: v2] Link: http://lkml.kernel.org/r/1511944612-9629-3-git-send-email-ghe@suse.com Link: http://lkml.kernel.org/r/1511775987-841-3-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Jun Piao Cc: alex chen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/extent_map.c | 45 +++++++++++++++++++++++++++++++++++++++++++ fs/ocfs2/extent_map.h | 3 +++ 2 files changed, 48 insertions(+) diff --git a/fs/ocfs2/extent_map.c b/fs/ocfs2/extent_map.c index e4719e0a3f99..06cb96462bf9 100644 --- a/fs/ocfs2/extent_map.c +++ b/fs/ocfs2/extent_map.c @@ -38,6 +38,7 @@ #include "inode.h" #include "super.h" #include "symlink.h" +#include "aops.h" #include "ocfs2_trace.h" #include "buffer_head_io.h" @@ -832,6 +833,50 @@ out: return ret; } +/* Is IO overwriting allocated blocks? 
*/ +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len) +{ + int ret = 0, is_last; + u32 mapping_end, cpos; + struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); + struct ocfs2_extent_rec rec; + + if (OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) { + if (ocfs2_size_fits_inline_data(di_bh, map_start + map_len)) + return ret; + else + return -EAGAIN; + } + + cpos = map_start >> osb->s_clustersize_bits; + mapping_end = ocfs2_clusters_for_bytes(inode->i_sb, + map_start + map_len); + is_last = 0; + while (cpos < mapping_end && !is_last) { + ret = ocfs2_get_clusters_nocache(inode, di_bh, cpos, + NULL, &rec, &is_last); + if (ret) { + mlog_errno(ret); + goto out; + } + + if (rec.e_blkno == 0ULL) + break; + + if (rec.e_flags & OCFS2_EXT_REFCOUNTED) + break; + + cpos = le32_to_cpu(rec.e_cpos) + + le16_to_cpu(rec.e_leaf_clusters); + } + + if (cpos < mapping_end) + ret = -EAGAIN; +out: + return ret; +} + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int whence) { struct inode *inode = file->f_mapping->host; diff --git a/fs/ocfs2/extent_map.h b/fs/ocfs2/extent_map.h index 67ea57d2fd59..1057586ec19f 100644 --- a/fs/ocfs2/extent_map.h +++ b/fs/ocfs2/extent_map.h @@ -53,6 +53,9 @@ int ocfs2_extent_map_get_blocks(struct inode *inode, u64 v_blkno, u64 *p_blkno, int ocfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 map_start, u64 map_len); +int ocfs2_overwrite_io(struct inode *inode, struct buffer_head *di_bh, + u64 map_start, u64 map_len); + int ocfs2_seek_data_hole_offset(struct file *file, loff_t *offset, int origin); int ocfs2_xattr_get_clusters(struct inode *inode, u32 v_cluster, From c4c2416ab0d656539cca5de4ae0a2ba8ec3d9eca Mon Sep 17 00:00:00 2001 From: Gang He Date: Wed, 31 Jan 2018 16:15:25 -0800 Subject: [PATCH 022/118] ocfs2: nowait aio support Return EAGAIN if any of the following checks fail for direct I/O: - Cannot get the related locks immediately - Blocks are not allocated at the write location, it will trigger block allocation and block IO operations. 
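From userspace the feature is reached through pwritev2()/preadv2() with RWF_NOWAIT on an O_DIRECT file descriptor; EAGAIN then means the request would have blocked or required allocation, and the application can retry on a worker thread. A minimal illustrative program (not part of the patch), assuming glibc >= 2.26 for the pwritev2() wrapper and headers that define RWF_NOWAIT:

	#define _GNU_SOURCE
	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>
	#include <sys/uio.h>
	#include <unistd.h>

	int main(int argc, char **argv)
	{
		struct iovec iov;
		void *buf;
		ssize_t ret;
		int fd;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <existing file on ocfs2>\n", argv[0]);
			return 1;
		}

		/* The nowait support added here covers direct I/O; a buffered
		 * write with RWF_NOWAIT is rejected with EOPNOTSUPP.
		 */
		fd = open(argv[1], O_WRONLY | O_DIRECT);
		if (fd < 0) {
			perror("open");
			return 1;
		}

		/* O_DIRECT wants an aligned buffer and length. */
		if (posix_memalign(&buf, 4096, 4096)) {
			perror("posix_memalign");
			return 1;
		}
		memset(buf, 'a', 4096);
		iov.iov_base = buf;
		iov.iov_len = 4096;

		ret = pwritev2(fd, &iov, 1, 0, RWF_NOWAIT);
		if (ret < 0 && errno == EAGAIN)
			fprintf(stderr, "write would block or allocate; retry elsewhere\n");
		else if (ret < 0)
			perror("pwritev2");
		else
			printf("wrote %zd bytes without blocking\n", ret);

		free(buf);
		close(fd);
		return 0;
	}
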
[ghe@suse.com: v4] Link: http://lkml.kernel.org/r/1516007283-29932-4-git-send-email-ghe@suse.com [ghe@suse.com: v2] Link: http://lkml.kernel.org/r/1511944612-9629-4-git-send-email-ghe@suse.com Link: http://lkml.kernel.org/r/1511775987-841-4-git-send-email-ghe@suse.com Signed-off-by: Gang He Reviewed-by: Alex Chen Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Cc: Jun Piao Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dir.c | 2 +- fs/ocfs2/dlmglue.c | 20 ++++++-- fs/ocfs2/dlmglue.h | 2 +- fs/ocfs2/file.c | 101 ++++++++++++++++++++++++++++++++--------- fs/ocfs2/mmap.c | 2 +- fs/ocfs2/ocfs2_trace.h | 10 ++-- 6 files changed, 104 insertions(+), 33 deletions(-) diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index 32f9c72dff17..b7520e20a770 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -1958,7 +1958,7 @@ int ocfs2_readdir(struct file *file, struct dir_context *ctx) trace_ocfs2_readdir((unsigned long long)OCFS2_I(inode)->ip_blkno); - error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level); + error = ocfs2_inode_lock_atime(inode, file->f_path.mnt, &lock_level, 1); if (lock_level && error >= 0) { /* We release EX lock which used to update atime * and get PR lock again to reduce contention diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 13fa809f4885..9479f99c2145 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -2546,13 +2546,18 @@ int ocfs2_inode_lock_with_page(struct inode *inode, int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level) + int *level, int wait) { int ret; - ret = ocfs2_inode_lock(inode, NULL, 0); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, 0); + else + ret = ocfs2_try_inode_lock(inode, NULL, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } @@ -2564,9 +2569,14 @@ int ocfs2_inode_lock_atime(struct inode *inode, struct buffer_head *bh = NULL; ocfs2_inode_unlock(inode, 0); - ret = ocfs2_inode_lock(inode, &bh, 1); + if (wait) + ret = ocfs2_inode_lock(inode, &bh, 1); + else + ret = ocfs2_try_inode_lock(inode, &bh, 1); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); return ret; } *level = 1; diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 34139a3d7118..256e0a9067b8 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -146,7 +146,7 @@ int ocfs2_try_open_lock(struct inode *inode, int write); void ocfs2_open_unlock(struct inode *inode); int ocfs2_inode_lock_atime(struct inode *inode, struct vfsmount *vfsmnt, - int *level); + int *level, int wait); int ocfs2_inode_lock_full_nested(struct inode *inode, struct buffer_head **ret_bh, int ex, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index a1d051055472..5d1784a365a3 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -140,6 +140,8 @@ static int ocfs2_file_open(struct inode *inode, struct file *file) spin_unlock(&oi->ip_lock); } + file->f_mode |= FMODE_NOWAIT; + leave: return status; } @@ -2132,12 +2134,12 @@ out: } static int ocfs2_prepare_inode_for_write(struct file *file, - loff_t pos, - size_t count) + loff_t pos, size_t count, int wait) { - int ret = 0, meta_level = 0; + int ret = 0, meta_level = 0, overwrite_io = 0; struct dentry *dentry = file->f_path.dentry; struct inode *inode = d_inode(dentry); + struct buffer_head *di_bh = NULL; loff_t end; /* @@ -2145,13 +2147,40 @@ static int ocfs2_prepare_inode_for_write(struct file *file, * if we need to make modifications here. 
*/ for(;;) { - ret = ocfs2_inode_lock(inode, NULL, meta_level); + if (wait) + ret = ocfs2_inode_lock(inode, NULL, meta_level); + else + ret = ocfs2_try_inode_lock(inode, + overwrite_io ? NULL : &di_bh, meta_level); if (ret < 0) { meta_level = -1; - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } + /* + * Check if IO will overwrite allocated blocks in case + * IOCB_NOWAIT flag is set. + */ + if (!wait && !overwrite_io) { + overwrite_io = 1; + if (!down_read_trylock(&OCFS2_I(inode)->ip_alloc_sem)) { + ret = -EAGAIN; + goto out_unlock; + } + + ret = ocfs2_overwrite_io(inode, di_bh, pos, count); + brelse(di_bh); + di_bh = NULL; + up_read(&OCFS2_I(inode)->ip_alloc_sem); + if (ret < 0) { + if (ret != -EAGAIN) + mlog_errno(ret); + goto out_unlock; + } + } + /* Clear suid / sgid if necessary. We do this here * instead of later in the write path because * remove_suid() calls ->setattr without any hint that @@ -2199,7 +2228,9 @@ static int ocfs2_prepare_inode_for_write(struct file *file, out_unlock: trace_ocfs2_prepare_inode_for_write(OCFS2_I(inode)->ip_blkno, - pos, count); + pos, count, wait); + + brelse(di_bh); if (meta_level >= 0) ocfs2_inode_unlock(inode, meta_level); @@ -2211,7 +2242,7 @@ out: static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { - int direct_io, rw_level; + int rw_level; ssize_t written = 0; ssize_t ret; size_t count = iov_iter_count(from); @@ -2223,6 +2254,8 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, void *saved_ki_complete = NULL; int append_write = ((iocb->ki_pos + count) >= i_size_read(inode) ? 1 : 0); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_write(inode, file, file->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2230,12 +2263,17 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, file->f_path.dentry->d_name.name, (unsigned int)from->nr_segs); /* GRRRRR */ + if (!direct_io && nowait) + return -EOPNOTSUPP; + if (count == 0) return 0; - direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - - inode_lock(inode); + if (nowait) { + if (!inode_trylock(inode)) + return -EAGAIN; + } else + inode_lock(inode); /* * Concurrent O_DIRECT writes are allowed with @@ -2244,9 +2282,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, */ rw_level = (!direct_io || full_coherency || append_write); - ret = ocfs2_rw_lock(inode, rw_level); + if (nowait) + ret = ocfs2_try_rw_lock(inode, rw_level); + else + ret = ocfs2_rw_lock(inode, rw_level); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out_mutex; } @@ -2260,9 +2302,13 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, * other nodes to drop their caches. Buffered I/O * already does this in write_begin(). 
*/ - ret = ocfs2_inode_lock(inode, NULL, 1); + if (nowait) + ret = ocfs2_try_inode_lock(inode, NULL, 1); + else + ret = ocfs2_inode_lock(inode, NULL, 1); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2277,9 +2323,10 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, } count = ret; - ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count); + ret = ocfs2_prepare_inode_for_write(file, iocb->ki_pos, count, !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto out; } @@ -2355,6 +2402,8 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, int ret = 0, rw_level = -1, lock_level = 0; struct file *filp = iocb->ki_filp; struct inode *inode = file_inode(filp); + int direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; + int nowait = iocb->ki_flags & IOCB_NOWAIT ? 1 : 0; trace_ocfs2_file_aio_read(inode, filp, filp->f_path.dentry, (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -2369,14 +2418,22 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, goto bail; } + if (!direct_io && nowait) + return -EOPNOTSUPP; + /* * buffered reads protect themselves in ->readpage(). O_DIRECT reads * need locks to protect pending reads from racing with truncate. */ - if (iocb->ki_flags & IOCB_DIRECT) { - ret = ocfs2_rw_lock(inode, 0); + if (direct_io) { + if (nowait) + ret = ocfs2_try_rw_lock(inode, 0); + else + ret = ocfs2_rw_lock(inode, 0); + if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } rw_level = 0; @@ -2393,9 +2450,11 @@ static ssize_t ocfs2_file_read_iter(struct kiocb *iocb, * like i_size. This allows the checks down below * generic_file_aio_read() a chance of actually working. */ - ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level); + ret = ocfs2_inode_lock_atime(inode, filp->f_path.mnt, &lock_level, + !nowait); if (ret < 0) { - mlog_errno(ret); + if (ret != -EAGAIN) + mlog_errno(ret); goto bail; } ocfs2_inode_unlock(inode, lock_level); diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index 098f5c712569..fb9a20e3d608 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -184,7 +184,7 @@ int ocfs2_mmap(struct file *file, struct vm_area_struct *vma) int ret = 0, lock_level = 0; ret = ocfs2_inode_lock_atime(file_inode(file), - file->f_path.mnt, &lock_level); + file->f_path.mnt, &lock_level, 1); if (ret < 0) { mlog_errno(ret); goto out; diff --git a/fs/ocfs2/ocfs2_trace.h b/fs/ocfs2/ocfs2_trace.h index a0b5d00ef0a9..e2a11aaece10 100644 --- a/fs/ocfs2/ocfs2_trace.h +++ b/fs/ocfs2/ocfs2_trace.h @@ -1449,20 +1449,22 @@ DEFINE_OCFS2_ULL_ULL_ULL_EVENT(ocfs2_remove_inode_range); TRACE_EVENT(ocfs2_prepare_inode_for_write, TP_PROTO(unsigned long long ino, unsigned long long saved_pos, - unsigned long count), - TP_ARGS(ino, saved_pos, count), + unsigned long count, int wait), + TP_ARGS(ino, saved_pos, count, wait), TP_STRUCT__entry( __field(unsigned long long, ino) __field(unsigned long long, saved_pos) __field(unsigned long, count) + __field(int, wait) ), TP_fast_assign( __entry->ino = ino; __entry->saved_pos = saved_pos; __entry->count = count; + __entry->wait = wait; ), - TP_printk("%llu %llu %lu", __entry->ino, - __entry->saved_pos, __entry->count) + TP_printk("%llu %llu %lu %d", __entry->ino, + __entry->saved_pos, __entry->count, __entry->wait) ); DEFINE_OCFS2_INT_EVENT(generic_file_aio_read_ret); From e75ed71be4f2f7508bd7d1d993d34095a10ca447 Mon Sep 17 00:00:00 2001 From: Changwei Ge Date: Wed, 31 Jan 2018 16:15:29 -0800 Subject: [PATCH 023/118] ocfs2: unlock 
bh_state if bg check fails We should unlock bh_stat if bg->bg_free_bits_count > bg->bg_bits Link: http://lkml.kernel.org/r/1516843095-23680-1-git-send-email-ge.changwei@h3c.com Signed-off-by: Changwei Ge Suggested-by: Jan Kara Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/suballoc.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 2d8d31c85f45..d8f5f6ce99dc 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -2438,6 +2438,8 @@ static int ocfs2_block_group_clear_bits(handle_t *handle, } le16_add_cpu(&bg->bg_free_bits_count, num_bits); if (le16_to_cpu(bg->bg_free_bits_count) > le16_to_cpu(bg->bg_bits)) { + if (undo_fn) + jbd_unlock_bh_state(group_bh); return ocfs2_error(alloc_inode->i_sb, "Group descriptor # %llu has bit count %u but claims %u are freed. num_bits %d\n", (unsigned long long)le64_to_cpu(bg->bg_blkno), le16_to_cpu(bg->bg_bits), From d984187e3a1ad7d12447a7ab2c43ce3717a2b5b3 Mon Sep 17 00:00:00 2001 From: piaojun Date: Wed, 31 Jan 2018 16:15:32 -0800 Subject: [PATCH 024/118] ocfs2: return error when we attempt to access a dirty bh in jbd2 We should not reuse the dirty bh in jbd2 directly due to the following situation: 1. When removing extent rec, we will dirty the bhs of extent rec and truncate log at the same time, and hand them over to jbd2. 2. The bhs are submitted to jbd2 area successfully. 3. The write-back thread of device help flush the bhs to disk but encounter write error due to abnormal storage link. 4. After a while the storage link become normal. Truncate log flush worker triggered by the next space reclaiming found the dirty bh of truncate log and clear its 'BH_Write_EIO' and then set it uptodate in __ocfs2_journal_access(): ocfs2_truncate_log_worker ocfs2_flush_truncate_log __ocfs2_flush_truncate_log ocfs2_replay_truncate_records ocfs2_journal_access_di __ocfs2_journal_access // here we clear io_error and set 'tl_bh' uptodata. 5. Then jbd2 will flush the bh of truncate log to disk, but the bh of extent rec is still in error state, and unfortunately nobody will take care of it. 6. At last the space of extent rec was not reduced, but truncate log flush worker have given it back to globalalloc. That will cause duplicate cluster problem which could be identified by fsck.ocfs2. Sadly we can hardly revert this but set fs read-only in case of ruining atomicity and consistency of space reclaim. Link: http://lkml.kernel.org/r/5A6E8092.8090701@huawei.com Fixes: acf8fdbe6afb ("ocfs2: do not BUG if buffer not uptodate in __ocfs2_journal_access") Signed-off-by: Jun Piao Reviewed-by: Yiwen Jiang Reviewed-by: Changwei Ge Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/journal.c | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 36304434eacf..e5dcea6cee5f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -666,23 +666,24 @@ static int __ocfs2_journal_access(handle_t *handle, /* we can safely remove this assertion after testing. 
*/ if (!buffer_uptodate(bh)) { mlog(ML_ERROR, "giving me a buffer that's not uptodate!\n"); - mlog(ML_ERROR, "b_blocknr=%llu\n", - (unsigned long long)bh->b_blocknr); + mlog(ML_ERROR, "b_blocknr=%llu, b_state=0x%lx\n", + (unsigned long long)bh->b_blocknr, bh->b_state); lock_buffer(bh); /* - * A previous attempt to write this buffer head failed. - * Nothing we can do but to retry the write and hope for - * the best. + * A previous transaction with a couple of buffer heads fail + * to checkpoint, so all the bhs are marked as BH_Write_EIO. + * For current transaction, the bh is just among those error + * bhs which previous transaction handle. We can't just clear + * its BH_Write_EIO and reuse directly, since other bhs are + * not written to disk yet and that will cause metadata + * inconsistency. So we should set fs read-only to avoid + * further damage. */ if (buffer_write_io_error(bh) && !buffer_uptodate(bh)) { - clear_buffer_write_io_error(bh); - set_buffer_uptodate(bh); - } - - if (!buffer_uptodate(bh)) { unlock_buffer(bh); - return -EIO; + return ocfs2_error(osb->sb, "A previous attempt to " + "write this buffer head failed\n"); } unlock_buffer(bh); } From 692ae74aaf226a557d88d5412a1764c09e63a193 Mon Sep 17 00:00:00 2001 From: Byongho Lee Date: Wed, 31 Jan 2018 16:15:36 -0800 Subject: [PATCH 025/118] mm/slab_common.c: make calculate_alignment() static calculate_alignment() function is only used inside slab_common.c. So make it static and let the compiler do more optimizations. After this patch there's a small improvement in text and data size. $ gcc --version gcc (GCC) 7.2.1 20171128 Before: text data bss dec hex filename 9890457 3828702 1212364 14931523 e3d643 vmlinux After: text data bss dec hex filename 9890437 3828670 1212364 14931471 e3d60f vmlinux Also I fixed a style problem reported by checkpatch. WARNING: Missing a blank line after declarations #53: FILE: mm/slab_common.c:286: + unsigned long ralign = cache_line_size(); + while (size <= ralign / 2) Link: http://lkml.kernel.org/r/20171210080132.406-1-bhlee.kernel@gmail.com Signed-off-by: Byongho Lee Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.h | 3 --- mm/slab_common.c | 56 +++++++++++++++++++++++++----------------------- 2 files changed, 29 insertions(+), 30 deletions(-) diff --git a/mm/slab.h b/mm/slab.h index ad657ffa44e5..e8e2095a6185 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -78,9 +78,6 @@ extern const struct kmalloc_info_struct { unsigned long size; } kmalloc_info[]; -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size); - #ifndef CONFIG_SLOB /* Kmalloc array related functions */ void setup_kmalloc_cache_index_table(void); diff --git a/mm/slab_common.c b/mm/slab_common.c index c8cb36774ba1..deeddf95cdcf 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -267,6 +267,35 @@ static inline void memcg_unlink_cache(struct kmem_cache *s) } #endif /* CONFIG_MEMCG && !CONFIG_SLOB */ +/* + * Figure out what the alignment of the objects will be given a set of + * flags, a user specified alignment and the size of the objects. + */ +static unsigned long calculate_alignment(unsigned long flags, + unsigned long align, unsigned long size) +{ + /* + * If the user wants hardware cache aligned objects then follow that + * suggestion if the object is sufficiently large. + * + * The hardware cache alignment cannot override the specified + * alignment though. 
If that is greater then use it. + */ + if (flags & SLAB_HWCACHE_ALIGN) { + unsigned long ralign; + + ralign = cache_line_size(); + while (size <= ralign / 2) + ralign /= 2; + align = max(align, ralign); + } + + if (align < ARCH_SLAB_MINALIGN) + align = ARCH_SLAB_MINALIGN; + + return ALIGN(align, sizeof(void *)); +} + /* * Find a mergeable slab cache */ @@ -337,33 +366,6 @@ struct kmem_cache *find_mergeable(size_t size, size_t align, return NULL; } -/* - * Figure out what the alignment of the objects will be given a set of - * flags, a user specified alignment and the size of the objects. - */ -unsigned long calculate_alignment(slab_flags_t flags, - unsigned long align, unsigned long size) -{ - /* - * If the user wants hardware cache aligned objects then follow that - * suggestion if the object is sufficiently large. - * - * The hardware cache alignment cannot override the specified - * alignment though. If that is greater then use it. - */ - if (flags & SLAB_HWCACHE_ALIGN) { - unsigned long ralign = cache_line_size(); - while (size <= ralign / 2) - ralign /= 2; - align = max(align, ralign); - } - - if (align < ARCH_SLAB_MINALIGN) - align = ARCH_SLAB_MINALIGN; - - return ALIGN(align, sizeof(void *)); -} - static struct kmem_cache *create_cache(const char *name, size_t object_size, size_t size, size_t align, slab_flags_t flags, void (*ctor)(void *), From 84ebb5827d015c1045429d018bf9a48f95f082a6 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:15:39 -0800 Subject: [PATCH 026/118] mm/slab.c: remove redundant assignments for slab_state slab_state is being set to "UP" in create_kmalloc_caches(), and later on we set it again in kmem_cache_init_late(), but slab_state does not change in the meantime. Remove the redundant assignment from kmem_cache_init_late(). And unless I overlooked anything, the same goes for "slab_state = FULL". slab_state is set to "FULL" in kmem_cache_init_late(), but it is later being set again in cpucache_init(), which gets called from do_initcall_level(). So remove the assignment from cpucache_init() as well. Link: http://lkml.kernel.org/r/20171215134452.GA1920@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index 4e51ef954026..226906294183 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1316,8 +1316,6 @@ void __init kmem_cache_init_late(void) { struct kmem_cache *cachep; - slab_state = UP; - /* 6) resize the head arrays to their final sizes */ mutex_lock(&slab_mutex); list_for_each_entry(cachep, &slab_caches, list) @@ -1353,8 +1351,6 @@ static int __init cpucache_init(void) slab_online_cpu, slab_offline_cpu); WARN_ON(ret < 0); - /* Done! */ - slab_state = FULL; return 0; } __initcall(cpucache_init); From 5d682681f8a2bd127748d707243661fcb00f7acb Mon Sep 17 00:00:00 2001 From: Balasubramani Vivekanandan Date: Wed, 31 Jan 2018 16:15:43 -0800 Subject: [PATCH 027/118] mm/slub.c: fix wrong address during slab padding restoration Start address calculated for slab padding restoration was wrong. 
Wrong address would point to some section before padding and could cause corruption Link: http://lkml.kernel.org/r/1516604578-4577-1-git-send-email-balasubramani_vivekanandan@mentor.com Signed-off-by: Balasubramani Vivekanandan Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index cfd56e5a35fb..733ba32c031b 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -838,6 +838,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) u8 *start; u8 *fault; u8 *end; + u8 *pad; int length; int remainder; @@ -851,8 +852,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) if (!remainder) return 1; + pad = end - remainder; metadata_access_enable(); - fault = memchr_inv(end - remainder, POISON_INUSE, remainder); + fault = memchr_inv(pad, POISON_INUSE, remainder); metadata_access_disable(); if (!fault) return 1; @@ -860,9 +862,9 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) end--; slab_err(s, page, "Padding overwritten. 0x%p-0x%p", fault, end - 1); - print_section(KERN_ERR, "Padding ", end - remainder, remainder); + print_section(KERN_ERR, "Padding ", pad, remainder); - restore_bytes(s, "slab padding", POISON_INUSE, end - remainder, end); + restore_bytes(s, "slab padding", POISON_INUSE, fault, end); return 0; } From 0d2d5d40deb49314b6f701589e1cae3bca3aa94c Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 31 Jan 2018 16:15:47 -0800 Subject: [PATCH 028/118] slub: remove obsolete comments of put_cpu_partial() Commit d6e0b7fa1186 ("slub: make dead caches discard free slabs immediately") makes put_cpu_partial() run with preemption disabled and interrupts disabled when calling unfreeze_partials(). The comment: "put_cpu_partial() is done without interrupts disabled and without preemption disabled" looks obsolete, so remove it. Link: http://lkml.kernel.org/r/1516968550-1520-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index 733ba32c031b..693b7074bc53 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2222,9 +2222,7 @@ static void unfreeze_partials(struct kmem_cache *s, /* * Put a page that was just frozen (in __slab_free) into a partial page - * slot if available. This is done without interrupts disabled and without - * preemption disabled. The cmpxchg is racy and may put the partial page - * onto a random cpus partial slot. + * slot if available. * * If we did not find a slot then simply move all the partials to the * per node partial list. From d70f2a14b72a4bc094cf3a92e4794644a7adc590 Mon Sep 17 00:00:00 2001 From: Andrew Morton Date: Wed, 31 Jan 2018 16:15:51 -0800 Subject: [PATCH 029/118] include/linux/sched/mm.h: uninline mmdrop_async(), etc mmdrop_async() is only used in fork.c. Move that and its support functions into fork.c, uninline it all. Quite a lot of code gets moved around to avoid forward declarations. 
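For reference (illustrative, not from this patch), call sites outside fork.c are
unaffected; the usual pin/unpin pattern keeps working, only mmdrop() becomes an
out-of-line, EXPORT_SYMBOL_GPL'd function:

	struct mm_struct *mm = current->mm;

	mmgrab(mm);	/* pin the mm_struct (mm_count) */
	/* ... use fields that only need mm_count pinned ... */
	mmdrop(mm);	/* now a real call into kernel/fork.c */
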
Cc: Ingo Molnar Cc: Michal Hocko Cc: Peter Zijlstra Cc: Oleg Nesterov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 24 +-- kernel/fork.c | 448 +++++++++++++++++++++------------------ 2 files changed, 238 insertions(+), 234 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 3d49b91b674d..bd422561a75e 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -11,7 +11,7 @@ /* * Routines for handling mm_structs */ -extern struct mm_struct * mm_alloc(void); +extern struct mm_struct *mm_alloc(void); /** * mmgrab() - Pin a &struct mm_struct. @@ -35,27 +35,7 @@ static inline void mmgrab(struct mm_struct *mm) atomic_inc(&mm->mm_count); } -/* mmdrop drops the mm and the page tables */ -extern void __mmdrop(struct mm_struct *); -static inline void mmdrop(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) - __mmdrop(mm); -} - -static inline void mmdrop_async_fn(struct work_struct *work) -{ - struct mm_struct *mm = container_of(work, struct mm_struct, async_put_work); - __mmdrop(mm); -} - -static inline void mmdrop_async(struct mm_struct *mm) -{ - if (unlikely(atomic_dec_and_test(&mm->mm_count))) { - INIT_WORK(&mm->async_put_work, mmdrop_async_fn); - schedule_work(&mm->async_put_work); - } -} +extern void mmdrop(struct mm_struct *mm); /** * mmget() - Pin the address space associated with a &struct mm_struct. diff --git a/kernel/fork.c b/kernel/fork.c index 2295fc69717f..5e6cf0dd031c 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -77,6 +77,7 @@ #include #include #include +#include #include #include #include @@ -390,6 +391,241 @@ void free_task(struct task_struct *tsk) } EXPORT_SYMBOL(free_task); +#ifdef CONFIG_MMU +static __latent_entropy int dup_mmap(struct mm_struct *mm, + struct mm_struct *oldmm) +{ + struct vm_area_struct *mpnt, *tmp, *prev, **pprev; + struct rb_node **rb_link, *rb_parent; + int retval; + unsigned long charge; + LIST_HEAD(uf); + + uprobe_start_dup_mmap(); + if (down_write_killable(&oldmm->mmap_sem)) { + retval = -EINTR; + goto fail_uprobe_end; + } + flush_cache_dup_mm(oldmm); + uprobe_dup_mmap(oldmm, mm); + /* + * Not linked in yet - no deadlock potential: + */ + down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); + + /* No ordering required: file already has been exposed. */ + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + + mm->total_vm = oldmm->total_vm; + mm->data_vm = oldmm->data_vm; + mm->exec_vm = oldmm->exec_vm; + mm->stack_vm = oldmm->stack_vm; + + rb_link = &mm->mm_rb.rb_node; + rb_parent = NULL; + pprev = &mm->mmap; + retval = ksm_fork(mm, oldmm); + if (retval) + goto out; + retval = khugepaged_fork(mm, oldmm); + if (retval) + goto out; + + prev = NULL; + for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { + struct file *file; + + if (mpnt->vm_flags & VM_DONTCOPY) { + vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); + continue; + } + charge = 0; + if (mpnt->vm_flags & VM_ACCOUNT) { + unsigned long len = vma_pages(mpnt); + + if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ + goto fail_nomem; + charge = len; + } + tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); + if (!tmp) + goto fail_nomem; + *tmp = *mpnt; + INIT_LIST_HEAD(&tmp->anon_vma_chain); + retval = vma_dup_policy(mpnt, tmp); + if (retval) + goto fail_nomem_policy; + tmp->vm_mm = mm; + retval = dup_userfaultfd(tmp, &uf); + if (retval) + goto fail_nomem_anon_vma_fork; + if (tmp->vm_flags & VM_WIPEONFORK) { + /* VM_WIPEONFORK gets a clean slate in the child. 
*/ + tmp->anon_vma = NULL; + if (anon_vma_prepare(tmp)) + goto fail_nomem_anon_vma_fork; + } else if (anon_vma_fork(tmp, mpnt)) + goto fail_nomem_anon_vma_fork; + tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); + tmp->vm_next = tmp->vm_prev = NULL; + file = tmp->vm_file; + if (file) { + struct inode *inode = file_inode(file); + struct address_space *mapping = file->f_mapping; + + get_file(file); + if (tmp->vm_flags & VM_DENYWRITE) + atomic_dec(&inode->i_writecount); + i_mmap_lock_write(mapping); + if (tmp->vm_flags & VM_SHARED) + atomic_inc(&mapping->i_mmap_writable); + flush_dcache_mmap_lock(mapping); + /* insert tmp into the share list, just after mpnt */ + vma_interval_tree_insert_after(tmp, mpnt, + &mapping->i_mmap); + flush_dcache_mmap_unlock(mapping); + i_mmap_unlock_write(mapping); + } + + /* + * Clear hugetlb-related page reserves for children. This only + * affects MAP_PRIVATE mappings. Faults generated by the child + * are not guaranteed to succeed, even if read-only + */ + if (is_vm_hugetlb_page(tmp)) + reset_vma_resv_huge_pages(tmp); + + /* + * Link in the new vma and copy the page table entries. + */ + *pprev = tmp; + pprev = &tmp->vm_next; + tmp->vm_prev = prev; + prev = tmp; + + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; + + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); + + if (tmp->vm_ops && tmp->vm_ops->open) + tmp->vm_ops->open(tmp); + + if (retval) + goto out; + } + /* a new mm has just been created */ + arch_dup_mmap(oldmm, mm); + retval = 0; +out: + up_write(&mm->mmap_sem); + flush_tlb_mm(oldmm); + up_write(&oldmm->mmap_sem); + dup_userfaultfd_complete(&uf); +fail_uprobe_end: + uprobe_end_dup_mmap(); + return retval; +fail_nomem_anon_vma_fork: + mpol_put(vma_policy(tmp)); +fail_nomem_policy: + kmem_cache_free(vm_area_cachep, tmp); +fail_nomem: + retval = -ENOMEM; + vm_unacct_memory(charge); + goto out; +} + +static inline int mm_alloc_pgd(struct mm_struct *mm) +{ + mm->pgd = pgd_alloc(mm); + if (unlikely(!mm->pgd)) + return -ENOMEM; + return 0; +} + +static inline void mm_free_pgd(struct mm_struct *mm) +{ + pgd_free(mm, mm->pgd); +} +#else +static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) +{ + down_write(&oldmm->mmap_sem); + RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); + up_write(&oldmm->mmap_sem); + return 0; +} +#define mm_alloc_pgd(mm) (0) +#define mm_free_pgd(mm) +#endif /* CONFIG_MMU */ + +static void check_mm(struct mm_struct *mm) +{ + int i; + + for (i = 0; i < NR_MM_COUNTERS; i++) { + long x = atomic_long_read(&mm->rss_stat.count[i]); + + if (unlikely(x)) + printk(KERN_ALERT "BUG: Bad rss-counter state " + "mm:%p idx:%d val:%ld\n", mm, i, x); + } + + if (mm_pgtables_bytes(mm)) + pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", + mm_pgtables_bytes(mm)); + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS + VM_BUG_ON_MM(mm->pmd_huge_pte, mm); +#endif +} + +#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) +#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) + +/* + * Called when the last reference to the mm + * is dropped: either by a lazy thread or by + * mmput. Free the page directory and the mm. 
+ */ +static void __mmdrop(struct mm_struct *mm) +{ + BUG_ON(mm == &init_mm); + mm_free_pgd(mm); + destroy_context(mm); + hmm_mm_destroy(mm); + mmu_notifier_mm_destroy(mm); + check_mm(mm); + put_user_ns(mm->user_ns); + free_mm(mm); +} + +void mmdrop(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) + __mmdrop(mm); +} +EXPORT_SYMBOL_GPL(mmdrop); + +static void mmdrop_async_fn(struct work_struct *work) +{ + struct mm_struct *mm; + + mm = container_of(work, struct mm_struct, async_put_work); + __mmdrop(mm); +} + +static void mmdrop_async(struct mm_struct *mm) +{ + if (unlikely(atomic_dec_and_test(&mm->mm_count))) { + INIT_WORK(&mm->async_put_work, mmdrop_async_fn); + schedule_work(&mm->async_put_work); + } +} + static inline void free_signal_struct(struct signal_struct *sig) { taskstats_tgid_free(sig); @@ -594,181 +830,8 @@ free_tsk: return NULL; } -#ifdef CONFIG_MMU -static __latent_entropy int dup_mmap(struct mm_struct *mm, - struct mm_struct *oldmm) -{ - struct vm_area_struct *mpnt, *tmp, *prev, **pprev; - struct rb_node **rb_link, *rb_parent; - int retval; - unsigned long charge; - LIST_HEAD(uf); - - uprobe_start_dup_mmap(); - if (down_write_killable(&oldmm->mmap_sem)) { - retval = -EINTR; - goto fail_uprobe_end; - } - flush_cache_dup_mm(oldmm); - uprobe_dup_mmap(oldmm, mm); - /* - * Not linked in yet - no deadlock potential: - */ - down_write_nested(&mm->mmap_sem, SINGLE_DEPTH_NESTING); - - /* No ordering required: file already has been exposed. */ - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - - mm->total_vm = oldmm->total_vm; - mm->data_vm = oldmm->data_vm; - mm->exec_vm = oldmm->exec_vm; - mm->stack_vm = oldmm->stack_vm; - - rb_link = &mm->mm_rb.rb_node; - rb_parent = NULL; - pprev = &mm->mmap; - retval = ksm_fork(mm, oldmm); - if (retval) - goto out; - retval = khugepaged_fork(mm, oldmm); - if (retval) - goto out; - - prev = NULL; - for (mpnt = oldmm->mmap; mpnt; mpnt = mpnt->vm_next) { - struct file *file; - - if (mpnt->vm_flags & VM_DONTCOPY) { - vm_stat_account(mm, mpnt->vm_flags, -vma_pages(mpnt)); - continue; - } - charge = 0; - if (mpnt->vm_flags & VM_ACCOUNT) { - unsigned long len = vma_pages(mpnt); - - if (security_vm_enough_memory_mm(oldmm, len)) /* sic */ - goto fail_nomem; - charge = len; - } - tmp = kmem_cache_alloc(vm_area_cachep, GFP_KERNEL); - if (!tmp) - goto fail_nomem; - *tmp = *mpnt; - INIT_LIST_HEAD(&tmp->anon_vma_chain); - retval = vma_dup_policy(mpnt, tmp); - if (retval) - goto fail_nomem_policy; - tmp->vm_mm = mm; - retval = dup_userfaultfd(tmp, &uf); - if (retval) - goto fail_nomem_anon_vma_fork; - if (tmp->vm_flags & VM_WIPEONFORK) { - /* VM_WIPEONFORK gets a clean slate in the child. 
*/ - tmp->anon_vma = NULL; - if (anon_vma_prepare(tmp)) - goto fail_nomem_anon_vma_fork; - } else if (anon_vma_fork(tmp, mpnt)) - goto fail_nomem_anon_vma_fork; - tmp->vm_flags &= ~(VM_LOCKED | VM_LOCKONFAULT); - tmp->vm_next = tmp->vm_prev = NULL; - file = tmp->vm_file; - if (file) { - struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; - - get_file(file); - if (tmp->vm_flags & VM_DENYWRITE) - atomic_dec(&inode->i_writecount); - i_mmap_lock_write(mapping); - if (tmp->vm_flags & VM_SHARED) - atomic_inc(&mapping->i_mmap_writable); - flush_dcache_mmap_lock(mapping); - /* insert tmp into the share list, just after mpnt */ - vma_interval_tree_insert_after(tmp, mpnt, - &mapping->i_mmap); - flush_dcache_mmap_unlock(mapping); - i_mmap_unlock_write(mapping); - } - - /* - * Clear hugetlb-related page reserves for children. This only - * affects MAP_PRIVATE mappings. Faults generated by the child - * are not guaranteed to succeed, even if read-only - */ - if (is_vm_hugetlb_page(tmp)) - reset_vma_resv_huge_pages(tmp); - - /* - * Link in the new vma and copy the page table entries. - */ - *pprev = tmp; - pprev = &tmp->vm_next; - tmp->vm_prev = prev; - prev = tmp; - - __vma_link_rb(mm, tmp, rb_link, rb_parent); - rb_link = &tmp->vm_rb.rb_right; - rb_parent = &tmp->vm_rb; - - mm->map_count++; - if (!(tmp->vm_flags & VM_WIPEONFORK)) - retval = copy_page_range(mm, oldmm, mpnt); - - if (tmp->vm_ops && tmp->vm_ops->open) - tmp->vm_ops->open(tmp); - - if (retval) - goto out; - } - /* a new mm has just been created */ - retval = arch_dup_mmap(oldmm, mm); -out: - up_write(&mm->mmap_sem); - flush_tlb_mm(oldmm); - up_write(&oldmm->mmap_sem); - dup_userfaultfd_complete(&uf); -fail_uprobe_end: - uprobe_end_dup_mmap(); - return retval; -fail_nomem_anon_vma_fork: - mpol_put(vma_policy(tmp)); -fail_nomem_policy: - kmem_cache_free(vm_area_cachep, tmp); -fail_nomem: - retval = -ENOMEM; - vm_unacct_memory(charge); - goto out; -} - -static inline int mm_alloc_pgd(struct mm_struct *mm) -{ - mm->pgd = pgd_alloc(mm); - if (unlikely(!mm->pgd)) - return -ENOMEM; - return 0; -} - -static inline void mm_free_pgd(struct mm_struct *mm) -{ - pgd_free(mm, mm->pgd); -} -#else -static int dup_mmap(struct mm_struct *mm, struct mm_struct *oldmm) -{ - down_write(&oldmm->mmap_sem); - RCU_INIT_POINTER(mm->exe_file, get_mm_exe_file(oldmm)); - up_write(&oldmm->mmap_sem); - return 0; -} -#define mm_alloc_pgd(mm) (0) -#define mm_free_pgd(mm) -#endif /* CONFIG_MMU */ - __cacheline_aligned_in_smp DEFINE_SPINLOCK(mmlist_lock); -#define allocate_mm() (kmem_cache_alloc(mm_cachep, GFP_KERNEL)) -#define free_mm(mm) (kmem_cache_free(mm_cachep, (mm))) - static unsigned long default_dump_filter = MMF_DUMP_FILTER_DEFAULT; static int __init coredump_filter_setup(char *s) @@ -858,27 +921,6 @@ fail_nopgd: return NULL; } -static void check_mm(struct mm_struct *mm) -{ - int i; - - for (i = 0; i < NR_MM_COUNTERS; i++) { - long x = atomic_long_read(&mm->rss_stat.count[i]); - - if (unlikely(x)) - printk(KERN_ALERT "BUG: Bad rss-counter state " - "mm:%p idx:%d val:%ld\n", mm, i, x); - } - - if (mm_pgtables_bytes(mm)) - pr_alert("BUG: non-zero pgtables_bytes on freeing mm: %ld\n", - mm_pgtables_bytes(mm)); - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS - VM_BUG_ON_MM(mm->pmd_huge_pte, mm); -#endif -} - /* * Allocate and initialize an mm_struct. 
*/ @@ -894,24 +936,6 @@ struct mm_struct *mm_alloc(void) return mm_init(mm, current, current_user_ns()); } -/* - * Called when the last reference to the mm - * is dropped: either by a lazy thread or by - * mmput. Free the page directory and the mm. - */ -void __mmdrop(struct mm_struct *mm) -{ - BUG_ON(mm == &init_mm); - mm_free_pgd(mm); - destroy_context(mm); - hmm_mm_destroy(mm); - mmu_notifier_mm_destroy(mm); - check_mm(mm); - put_user_ns(mm->user_ns); - free_mm(mm); -} -EXPORT_SYMBOL_GPL(__mmdrop); - static inline void __mmput(struct mm_struct *mm) { VM_BUG_ON(atomic_read(&mm->mm_users)); From 4a01768e9e91082efc9a6384b1ef579fdcbce828 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:15:55 -0800 Subject: [PATCH 030/118] mm: kmemleak: remove unused hardirq.h Preempt counter APIs have been split out, currently, hardirq.h just includes irq_enter/exit APIs which are not used by kmemleak at all. So, remove the unused hardirq.h. Link: http://lkml.kernel.org/r/1510959741-31109-1-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Cc: Michal Hocko Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/kmemleak.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/kmemleak.c b/mm/kmemleak.c index f656ca27f6c2..e83987c55a08 100644 --- a/mm/kmemleak.c +++ b/mm/kmemleak.c @@ -91,7 +91,6 @@ #include #include #include -#include #include #include #include From a85f878b443f8d2b91ba76f09da21ac0af22e07f Mon Sep 17 00:00:00 2001 From: Srividya Desireddy Date: Wed, 31 Jan 2018 16:15:59 -0800 Subject: [PATCH 031/118] zswap: same-filled pages handling Zswap is a cache which compresses the pages that are being swapped out and stores them into a dynamically allocated RAM-based memory pool. Experiments have shown that around 10-20% of pages stored in zswap are same-filled pages (i.e. contents of the page are all same), but these pages are handled as normal pages by compressing and allocating memory in the pool. This patch adds a check in zswap_frontswap_store() to identify same-filled page before compression of the page. If the page is a same-filled page, set zswap_entry.length to zero, save the same-filled value and skip the compression of the page and alloction of memory in zpool. In zswap_frontswap_load(), check if value of zswap_entry.length is zero corresponding to the page to be loaded. If zswap_entry.length is zero, fill the page with same-filled value. This saves the decompression time during load. On a ARM Quad Core 32-bit device with 1.5GB RAM by launching and relaunching different applications, out of ~64000 pages stored in zswap, ~11000 pages were same-value filled pages (including zero-filled pages) and ~9000 pages were zero-filled pages. An average of 17% of pages(including zero-filled pages) in zswap are same-value filled pages and 14% pages are zero-filled pages. An average of 3% of pages are same-filled non-zero pages. The below table shows the execution time profiling with the patch. Baseline With patch % Improvement ----------------------------------------------------------------- *Zswap Store Time 26.5ms 18ms 32% (of same value pages) *Zswap Load Time (of same value pages) 25.5ms 13ms 49% ----------------------------------------------------------------- On Ubuntu PC with 2GB RAM, while executing kernel build and other test scripts and running multimedia applications, out of 360000 pages stored in zswap 78000(~22%) of pages were found to be same-value filled pages (including zero-filled pages) and 64000(~17%) are zero-filled pages. 
So an average of %5 of pages are same-filled non-zero pages. The below table shows the execution time profiling with the patch. Baseline With patch % Improvement ----------------------------------------------------------------- *Zswap Store Time 91ms 74ms 19% (of same value pages) *Zswap Load Time 50ms 7.5ms 85% (of same value pages) ----------------------------------------------------------------- *The execution times may vary with test device used. Dan said: : I did test this patch out this week, and I added some instrumentation to : check the performance impact, and tested with a small program to try to : check the best and worst cases. : : When doing a lot of swap where all (or almost all) pages are same-value, I : found this patch does save both time and space, significantly. The exact : improvement in time and space depends on which compressor is being used, : but roughly agrees with the numbers you listed. : : In the worst case situation, where all (or almost all) pages have the : same-value *except* the final long (meaning, zswap will check each long on : the entire page but then still have to pass the page to the compressor), : the same-value check is around 10-15% of the total time spent in : zswap_frontswap_store(). That's a not-insignificant amount of time, but : it's not huge. Considering that most systems will probably be swapping : pages that aren't similar to the worst case (although I don't have any : data to know that), I'd say the improvement is worth the possible : worst-case performance impact. [srividya.dr@samsung.com: add memset_l instead of for loop] Link: http://lkml.kernel.org/r/20171018104832epcms5p1b2232e2236258de3d03d1344dde9fce0@epcms5p1 Signed-off-by: Srividya Desireddy Acked-by: Dan Streetman Cc: Seth Jennings Cc: Pekka Enberg Cc: Dinakar Reddy Pathireddy Cc: SHARAN ALLUR Cc: RAJIB BASU Cc: JUHUN KIM Cc: Matthew Wilcox Cc: Timofey Titovets Cc: Andi Kleen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zswap.c | 71 ++++++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 66 insertions(+), 5 deletions(-) diff --git a/mm/zswap.c b/mm/zswap.c index d39581a076c3..1133b4ceb72e 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -49,6 +49,8 @@ static u64 zswap_pool_total_size; /* The number of compressed pages currently stored in zswap */ static atomic_t zswap_stored_pages = ATOMIC_INIT(0); +/* The number of same-value filled pages currently stored in zswap */ +static atomic_t zswap_same_filled_pages = ATOMIC_INIT(0); /* * The statistics below are not protected from concurrent access for @@ -116,6 +118,11 @@ module_param_cb(zpool, &zswap_zpool_param_ops, &zswap_zpool_type, 0644); static unsigned int zswap_max_pool_percent = 20; module_param_named(max_pool_percent, zswap_max_pool_percent, uint, 0644); +/* Enable/disable handling same-value filled pages (enabled by default) */ +static bool zswap_same_filled_pages_enabled = true; +module_param_named(same_filled_pages_enabled, zswap_same_filled_pages_enabled, + bool, 0644); + /********************************* * data structures **********************************/ @@ -145,9 +152,10 @@ struct zswap_pool { * be held while changing the refcount. Since the lock must * be held, there is no reason to also make refcount atomic. * length - the length in bytes of the compressed page data. Needed during - * decompression + * decompression. For a same value filled page length is 0. 
* pool - the zswap_pool the entry's data is in * handle - zpool allocation handle that stores the compressed page data + * value - value of the same-value filled pages which have same content */ struct zswap_entry { struct rb_node rbnode; @@ -155,7 +163,10 @@ struct zswap_entry { int refcount; unsigned int length; struct zswap_pool *pool; - unsigned long handle; + union { + unsigned long handle; + unsigned long value; + }; }; struct zswap_header { @@ -320,8 +331,12 @@ static void zswap_rb_erase(struct rb_root *root, struct zswap_entry *entry) */ static void zswap_free_entry(struct zswap_entry *entry) { - zpool_free(entry->pool->zpool, entry->handle); - zswap_pool_put(entry->pool); + if (!entry->length) + atomic_dec(&zswap_same_filled_pages); + else { + zpool_free(entry->pool->zpool, entry->handle); + zswap_pool_put(entry->pool); + } zswap_entry_cache_free(entry); atomic_dec(&zswap_stored_pages); zswap_update_total_size(); @@ -953,6 +968,28 @@ static int zswap_shrink(void) return ret; } +static int zswap_is_page_same_filled(void *ptr, unsigned long *value) +{ + unsigned int pos; + unsigned long *page; + + page = (unsigned long *)ptr; + for (pos = 1; pos < PAGE_SIZE / sizeof(*page); pos++) { + if (page[pos] != page[0]) + return 0; + } + *value = page[0]; + return 1; +} + +static void zswap_fill_page(void *ptr, unsigned long value) +{ + unsigned long *page; + + page = (unsigned long *)ptr; + memset_l(page, value, PAGE_SIZE / sizeof(unsigned long)); +} + /********************************* * frontswap hooks **********************************/ @@ -965,7 +1002,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct crypto_comp *tfm; int ret; unsigned int dlen = PAGE_SIZE, len; - unsigned long handle; + unsigned long handle, value; char *buf; u8 *src, *dst; struct zswap_header *zhdr; @@ -993,6 +1030,19 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, goto reject; } + if (zswap_same_filled_pages_enabled) { + src = kmap_atomic(page); + if (zswap_is_page_same_filled(src, &value)) { + kunmap_atomic(src); + entry->offset = offset; + entry->length = 0; + entry->value = value; + atomic_inc(&zswap_same_filled_pages); + goto insert_entry; + } + kunmap_atomic(src); + } + /* if entry is successfully added, it keeps the reference */ entry->pool = zswap_pool_current_get(); if (!entry->pool) { @@ -1037,6 +1087,7 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, entry->handle = handle; entry->length = dlen; +insert_entry: /* map */ spin_lock(&tree->lock); do { @@ -1089,6 +1140,13 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, } spin_unlock(&tree->lock); + if (!entry->length) { + dst = kmap_atomic(page); + zswap_fill_page(dst, entry->value); + kunmap_atomic(dst); + goto freeentry; + } + /* decompress */ dlen = PAGE_SIZE; src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, @@ -1101,6 +1159,7 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, zpool_unmap_handle(entry->pool->zpool, entry->handle); BUG_ON(ret); +freeentry: spin_lock(&tree->lock); zswap_entry_put(tree, entry); spin_unlock(&tree->lock); @@ -1209,6 +1268,8 @@ static int __init zswap_debugfs_init(void) zswap_debugfs_root, &zswap_pool_total_size); debugfs_create_atomic_t("stored_pages", S_IRUGO, zswap_debugfs_root, &zswap_stored_pages); + debugfs_create_atomic_t("same_filled_pages", 0444, + zswap_debugfs_root, &zswap_same_filled_pages); return 0; } From 2e3ca40f03bb13709df40eff2f7fc157803fa5a3 Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 Jan 
2018 16:16:02 -0800 Subject: [PATCH 032/118] mm: relax deferred struct page requirements There is no need to have ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT, as all the page initialization code is in common code. Also, there is no need to depend on MEMORY_HOTPLUG, as initialization code does not really use hotplug memory functionality. So, we can remove this requirement as well. This patch allows to use deferred struct page initialization on all platforms with memblock allocator. Tested on x86, arm64, and sparc. Also, verified that code compiles on PPC with CONFIG_MEMORY_HOTPLUG disabled. Link: http://lkml.kernel.org/r/20171117014601.31606-1-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Acked-by: Heiko Carstens [s390] Reviewed-by: Khalid Aziz Acked-by: Michael Ellerman Acked-by: Michal Hocko Cc: Steven Sistare Cc: Daniel Jordan Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Kirill A. Shutemov Cc: Reza Arbab Cc: Martin Schwidefsky Cc: Thomas Gleixner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/Kconfig | 1 - arch/s390/Kconfig | 1 - arch/x86/Kconfig | 1 - mm/Kconfig | 7 +------ 4 files changed, 1 insertion(+), 9 deletions(-) diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index e92432ae9737..73fcf592ee91 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -151,7 +151,6 @@ config PPC select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_WANT_IPC_PARSE_VERSION diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index 9376637229c9..0105ce28e246 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -108,7 +108,6 @@ config S390 select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE select ARCH_SAVE_PAGE_KEYS if HIBERNATION select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index dbe5542a6666..7a1c51198af1 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -69,7 +69,6 @@ config X86 select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_SUPPORTS_ATOMIC_RMW - select ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS diff --git a/mm/Kconfig b/mm/Kconfig index 03ff7703d322..c782e8fb7235 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -639,15 +639,10 @@ config MAX_STACK_SIZE_MB A sane initial value is 80 MB. -# For architectures that support deferred memory initialisation -config ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - bool - config DEFERRED_STRUCT_PAGE_INIT bool "Defer initialisation of struct pages to kthreads" default n - depends on ARCH_SUPPORTS_DEFERRED_STRUCT_PAGE_INIT - depends on NO_BOOTMEM && MEMORY_HOTPLUG + depends on NO_BOOTMEM depends on !FLATMEM help Ordinarily all struct pages are initialised during early boot in a From 66f308ed7dab1b3460d186a794e1f9c2d229f709 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:16:07 -0800 Subject: [PATCH 033/118] mm/mempolicy: remove redundant check in get_nodes We have already checked whether maxnode is a page worth of bits, by: maxnode > PAGE_SIZE*BITS_PER_BYTE So no need to check it once more. 
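For reference, a short derivation of why the dropped test could never fire after the
earlier bound check (assuming the usual BITS_TO_LONGS() round-up and BITS_PER_LONG
dividing PAGE_SIZE*BITS_PER_BYTE, which holds on all supported configurations):

	/*
	 * maxnode <= PAGE_SIZE * BITS_PER_BYTE            (checked earlier)
	 * nlongs  =  BITS_TO_LONGS(maxnode)
	 *         <= (PAGE_SIZE * BITS_PER_BYTE) / BITS_PER_LONG
	 *         =  PAGE_SIZE / sizeof(long)
	 *
	 * so "nlongs > PAGE_SIZE/sizeof(long)" was always false.
	 */
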
Link: http://lkml.kernel.org/r/1510882624-44342-2-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Acked-by: Vlastimil Babka Acked-by: David Rientjes Cc: Ingo Molnar Cc: David Rientjes Cc: Naoya Horiguchi Cc: Chris Salls Cc: Andi Kleen Cc: Christopher Lameter Cc: Tan Xiaojun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 4ce44d3ff03d..6e867a8dcca9 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1282,8 +1282,6 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, /* When the user specified more nodes than supported just check if the non supported part is all zero. */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { - if (nlongs > PAGE_SIZE/sizeof(long)) - return -EINVAL; for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { unsigned long t; if (get_user(t, nmask + k)) From 56521e7a02b7b84a5e72691a1fb15570e6055545 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:16:11 -0800 Subject: [PATCH 034/118] mm/mempolicy: fix the check of nodemask from user As Xiaojun reported the ltp of migrate_pages01 will fail on arm64 system which has 4 nodes[0...3], all have memory and CONFIG_NODES_SHIFT=2: migrate_pages01 0 TINFO : test_invalid_nodes migrate_pages01 14 TFAIL : migrate_pages_common.c:45: unexpected failure - returned value = 0, expected: -1 migrate_pages01 15 TFAIL : migrate_pages_common.c:55: call succeeded unexpectedly In this case the test_invalid_nodes of migrate_pages01 will call: SYSC_migrate_pages as: migrate_pages(0, , {0x0000000000000001}, 64, , {0x0000000000000010}, 64) = 0 The new nodes specifies one or more node IDs that are greater than the maximum supported node ID, however, the errno is not set to EINVAL as expected. As man pages of set_mempolicy[1], mbind[2], and migrate_pages[3] mentioned, when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, the errno should set to EINVAL. However, get_nodes only check whether the part of bits [BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES), maxnode) is zero or not, and remain [MAX_NUMNODES, BITS_PER_LONG*BITS_TO_LONGS(MAX_NUMNODES) unchecked. This patch is to check the bits of [MAX_NUMNODES, maxnode) in get_nodes to let migrate_pages set the errno to EINVAL when nodemask specifies one or more node IDs that are greater than the maximum supported node ID, which follows the manpage's guide. 
[1] http://man7.org/linux/man-pages/man2/set_mempolicy.2.html [2] http://man7.org/linux/man-pages/man2/mbind.2.html [3] http://man7.org/linux/man-pages/man2/migrate_pages.2.html Link: http://lkml.kernel.org/r/1510882624-44342-3-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Reported-by: Tan Xiaojun Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Chris Salls Cc: Christopher Lameter Cc: David Rientjes Cc: Ingo Molnar Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 6e867a8dcca9..65df28d7cc89 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1263,6 +1263,7 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, unsigned long maxnode) { unsigned long k; + unsigned long t; unsigned long nlongs; unsigned long endmask; @@ -1279,11 +1280,17 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, else endmask = (1UL << (maxnode % BITS_PER_LONG)) - 1; - /* When the user specified more nodes than supported just check - if the non supported part is all zero. */ + /* + * When the user specified more nodes than supported just check + * if the non supported part is all zero. + * + * If maxnode have more longs than MAX_NUMNODES, check + * the bits in that area first. And then go through to + * check the rest bits which equal or bigger than MAX_NUMNODES. + * Otherwise, just check bits [MAX_NUMNODES, maxnode). + */ if (nlongs > BITS_TO_LONGS(MAX_NUMNODES)) { for (k = BITS_TO_LONGS(MAX_NUMNODES); k < nlongs; k++) { - unsigned long t; if (get_user(t, nmask + k)) return -EFAULT; if (k == nlongs - 1) { @@ -1296,6 +1303,16 @@ static int get_nodes(nodemask_t *nodes, const unsigned long __user *nmask, endmask = ~0UL; } + if (maxnode > MAX_NUMNODES && MAX_NUMNODES % BITS_PER_LONG != 0) { + unsigned long valid_mask = endmask; + + valid_mask &= ~((1UL << (MAX_NUMNODES % BITS_PER_LONG)) - 1); + if (get_user(t, nmask + nlongs - 1)) + return -EFAULT; + if (t & valid_mask) + return -EINVAL; + } + if (copy_from_user(nodes_addr(*nodes), nmask, nlongs*sizeof(unsigned long))) return -EFAULT; nodes_addr(*nodes)[nlongs-1] &= endmask; From 0486a38bcc4749808edbc848f1bcf232042770fc Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:16:15 -0800 Subject: [PATCH 035/118] mm/mempolicy: add nodes_empty check in SYSC_migrate_pages As in manpage of migrate_pages, the errno should be set to EINVAL when none of the node IDs specified by new_nodes are on-line and allowed by the process's current cpuset context, or none of the specified nodes contain memory. However, when test by following case: new_nodes = 0; old_nodes = 0xf; ret = migrate_pages(pid, old_nodes, new_nodes, MAX); The ret will be 0 and no errno is set. As the new_nodes is empty, we should expect EINVAL as documented. To fix the case like above, this patch check whether target nodes AND current task_nodes is empty, and then check whether AND node_states[N_MEMORY] is empty. 
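A hypothetical userspace check (not from the patch) of the behaviour this change
enforces; with an empty destination mask the raw syscall is now expected to return
-1 with errno set to EINVAL instead of silently succeeding:

	#include <errno.h>
	#include <stdio.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		unsigned long old_nodes = 0xf;	/* nodes 0-3 */
		unsigned long new_nodes = 0;	/* empty destination mask */
		unsigned long maxnode = 8 * sizeof(unsigned long);
		long ret = syscall(__NR_migrate_pages, getpid(), maxnode,
				   &old_nodes, &new_nodes);

		printf("ret=%ld errno=%d (EINVAL expected)\n",
		       ret, ret < 0 ? errno : 0);
		return 0;
	}
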
Link: http://lkml.kernel.org/r/1510882624-44342-4-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Acked-by: Vlastimil Babka Cc: Andi Kleen Cc: Chris Salls Cc: Christopher Lameter Cc: David Rientjes Cc: Ingo Molnar Cc: Naoya Horiguchi Cc: Tan Xiaojun Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mempolicy.c | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 65df28d7cc89..f604b22ebb65 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1433,10 +1433,14 @@ SYSCALL_DEFINE4(migrate_pages, pid_t, pid, unsigned long, maxnode, goto out_put; } - if (!nodes_subset(*new, node_states[N_MEMORY])) { - err = -EINVAL; + task_nodes = cpuset_mems_allowed(current); + nodes_and(*new, *new, task_nodes); + if (nodes_empty(*new)) + goto out_put; + + nodes_and(*new, *new, node_states[N_MEMORY]); + if (nodes_empty(*new)) goto out_put; - } err = security_task_movememory(task); if (err) From 9852a7212324fd25f896932f4f4607ce47b0a22f Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:16:19 -0800 Subject: [PATCH 036/118] mm: drop hotplug lock from lru_add_drain_all() Pulling cpu hotplug locks inside the mm core function like lru_add_drain_all just asks for problems and the recent lockdep splat [1] just proves this. While the usage in that particular case might be wrong we should avoid the locking as lru_add_drain_all() is used in many places. It seems that this is not all that hard to achieve actually. We have done the same thing for drain_all_pages which is analogous by commit a459eeb7b852 ("mm, page_alloc: do not depend on cpu hotplug locks inside the allocator"). All we have to care about is to handle - the work item might be executed on a different cpu in worker from unbound pool so it doesn't run on pinned on the cpu - we have to make sure that we do not race with page_alloc_cpu_dead calling lru_add_drain_cpu the first part is already handled because the worker calls lru_add_drain which disables preemption when calling lru_add_drain_cpu on the local cpu it is draining. The later is true because page_alloc_cpu_dead is called on the controlling CPU after the hotplugged CPU vanished completely. 
[1] http://lkml.kernel.org/r/089e0825eec8955c1f055c83d476@google.com [add a cpu hotplug locking interaction as per tglx] Link: http://lkml.kernel.org/r/20171116120535.23765-1-mhocko@kernel.org Signed-off-by: Michal Hocko Acked-by: Thomas Gleixner Cc: Tejun Heo Cc: Peter Zijlstra Cc: Johannes Weiner Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - mm/memory_hotplug.c | 2 +- mm/swap.c | 16 ++++++++-------- 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index c2b8128799c1..0bd4c25016f9 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -332,7 +332,6 @@ extern void mark_page_accessed(struct page *); extern void lru_add_drain(void); extern void lru_add_drain_cpu(int cpu); extern void lru_add_drain_all(void); -extern void lru_add_drain_all_cpuslocked(void); extern void rotate_reclaimable_page(struct page *page); extern void deactivate_file_page(struct page *page); extern void mark_page_lazyfree(struct page *page); diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index c52aa05b106c..999ce3af809d 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1637,7 +1637,7 @@ repeat: goto failed_removal; cond_resched(); - lru_add_drain_all_cpuslocked(); + lru_add_drain_all(); drain_all_pages(zone); pfn = scan_movable_pages(start_pfn, end_pfn); diff --git a/mm/swap.c b/mm/swap.c index 38e1b6374a97..e824c800adca 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -688,7 +688,14 @@ static void lru_add_drain_per_cpu(struct work_struct *dummy) static DEFINE_PER_CPU(struct work_struct, lru_add_drain_work); -void lru_add_drain_all_cpuslocked(void) +/* + * Doesn't need any cpu hotplug locking because we do rely on per-cpu + * kworkers being shut down before our page_alloc_cpu_dead callback is + * executed on the offlined cpu. + * Calling this function with cpu hotplug locks held can actually lead + * to obscure indirect dependencies via WQ context. + */ +void lru_add_drain_all(void) { static DEFINE_MUTEX(lock); static struct cpumask has_work; @@ -724,13 +731,6 @@ void lru_add_drain_all_cpuslocked(void) mutex_unlock(&lock); } -void lru_add_drain_all(void) -{ - get_online_cpus(); - lru_add_drain_all_cpuslocked(); - put_online_cpus(); -} - /** * release_pages - batched put_page() * @pages: array of pages to release From fcb2b0c577f145c7616843c9d4dcb4f9e5d88e29 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Wed, 31 Jan 2018 16:16:22 -0800 Subject: [PATCH 037/118] mm: show total hugetlb memory consumption in /proc/meminfo Currently we display some hugepage statistics (total, free, etc) in /proc/meminfo, but only for default hugepage size (e.g. 2Mb). If hugepages of different sizes are used (like 2Mb and 1Gb on x86-64), /proc/meminfo output can be confusing, as non-default sized hugepages are not reflected at all, and there are no signs that they are existing and consuming system memory. To solve this problem, let's display the total amount of memory, consumed by hugetlb pages of all sized (both free and used). Let's call it "Hugetlb", and display size in kB to match generic /proc/meminfo style. 
For example, (1024 2Mb pages and 2 1Gb pages are pre-allocated): $ cat /proc/meminfo MemTotal: 8168984 kB MemFree: 3789276 kB <...> CmaFree: 0 kB HugePages_Total: 1024 HugePages_Free: 1024 HugePages_Rsvd: 0 HugePages_Surp: 0 Hugepagesize: 2048 kB Hugetlb: 4194304 kB DirectMap4k: 32632 kB DirectMap2M: 4161536 kB DirectMap1G: 6291456 kB Also, this patch updates corresponding docs to reflect Hugetlb entry meaning and difference between Hugetlb and HugePages_Total * Hugepagesize. Link: http://lkml.kernel.org/r/20171115231409.12131-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Michal Hocko Acked-by: Johannes Weiner Acked-by: David Rientjes Cc: Mike Kravetz Cc: "Aneesh Kumar K.V" Cc: Andrea Arcangeli Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/vm/hugetlbpage.txt | 27 ++++++++++++++++-------- mm/hugetlb.c | 36 +++++++++++++++++++++----------- 2 files changed, 42 insertions(+), 21 deletions(-) diff --git a/Documentation/vm/hugetlbpage.txt b/Documentation/vm/hugetlbpage.txt index 59cbc803aad6..faf077d50d42 100644 --- a/Documentation/vm/hugetlbpage.txt +++ b/Documentation/vm/hugetlbpage.txt @@ -20,19 +20,20 @@ options. The /proc/meminfo file provides information about the total number of persistent hugetlb pages in the kernel's huge page pool. It also displays -information about the number of free, reserved and surplus huge pages and the -default huge page size. The huge page size is needed for generating the -proper alignment and size of the arguments to system calls that map huge page -regions. +default huge page size and information about the number of free, reserved +and surplus huge pages in the pool of huge pages of default size. +The huge page size is needed for generating the proper alignment and +size of the arguments to system calls that map huge page regions. The output of "cat /proc/meminfo" will include lines like: ..... -HugePages_Total: vvv -HugePages_Free: www -HugePages_Rsvd: xxx -HugePages_Surp: yyy -Hugepagesize: zzz kB +HugePages_Total: uuu +HugePages_Free: vvv +HugePages_Rsvd: www +HugePages_Surp: xxx +Hugepagesize: yyy kB +Hugetlb: zzz kB where: HugePages_Total is the size of the pool of huge pages. @@ -47,6 +48,14 @@ HugePages_Surp is short for "surplus," and is the number of huge pages in the pool above the value in /proc/sys/vm/nr_hugepages. The maximum number of surplus huge pages is controlled by /proc/sys/vm/nr_overcommit_hugepages. +Hugepagesize is the default hugepage size (in Kb). +Hugetlb is the total amount of memory (in kB), consumed by huge + pages of all sizes. + If huge pages of different sizes are in use, this number + will exceed HugePages_Total * Hugepagesize. To get more + detailed information, please, refer to + /sys/kernel/mm/hugepages (described below). + /proc/filesystems should also show a filesystem of type "hugetlbfs" configured in the kernel. 
diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 9a334f5fb730..1e6a5ad0d420 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -2975,20 +2975,32 @@ out: void hugetlb_report_meminfo(struct seq_file *m) { - struct hstate *h = &default_hstate; + struct hstate *h; + unsigned long total = 0; + if (!hugepages_supported()) return; - seq_printf(m, - "HugePages_Total: %5lu\n" - "HugePages_Free: %5lu\n" - "HugePages_Rsvd: %5lu\n" - "HugePages_Surp: %5lu\n" - "Hugepagesize: %8lu kB\n", - h->nr_huge_pages, - h->free_huge_pages, - h->resv_huge_pages, - h->surplus_huge_pages, - 1UL << (huge_page_order(h) + PAGE_SHIFT - 10)); + + for_each_hstate(h) { + unsigned long count = h->nr_huge_pages; + + total += (PAGE_SIZE << huge_page_order(h)) * count; + + if (h == &default_hstate) + seq_printf(m, + "HugePages_Total: %5lu\n" + "HugePages_Free: %5lu\n" + "HugePages_Rsvd: %5lu\n" + "HugePages_Surp: %5lu\n" + "Hugepagesize: %8lu kB\n", + count, + h->free_huge_pages, + h->resv_huge_pages, + h->surplus_huge_pages, + (PAGE_SIZE << huge_page_order(h)) / 1024); + } + + seq_printf(m, "Hugetlb: %8lu kB\n", total / 1024); } int hugetlb_report_node_meminfo(int nid, char *buf) From 9092c71bb724dba2ecba849eae69e5c9d39bd3d2 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 31 Jan 2018 16:16:26 -0800 Subject: [PATCH 038/118] mm: use sc->priority for slab shrink targets Previously we were using the ratio of the number of lru pages scanned to the number of eligible lru pages to determine the number of slab objects to scan. The problem with this is that these two things have nothing to do with each other, so in slab heavy work loads where there is little to no page cache we can end up with the pages scanned being a very low number. This means that we reclaim next to no slab pages and waste a lot of time reclaiming small amounts of space. Consider the following scenario, where we have the following values and the rest of the memory usage is in slab Active: 58840 kB Inactive: 46860 kB Every time we do a get_scan_count() we do this scan = size >> sc->priority where sc->priority starts at DEF_PRIORITY, which is 12. The first loop through reclaim would result in a scan target of 2 pages to 11715 total inactive pages, and 3 pages to 14710 total active pages. This is a really really small target for a system that is entirely slab pages. And this is super optimistic, this assumes we even get to scan these pages. We don't increment sc->nr_scanned unless we 1) isolate the page, which assumes it's not in use, and 2) can lock the page. Under pressure these numbers could probably go down, I'm sure there's some random pages from daemons that aren't actually in use, so the targets get even smaller. Instead use sc->priority in the same way we use it to determine scan amounts for the lru's. This generally equates to pages. Consider the following slab_pages = (nr_objects * object_size) / PAGE_SIZE What we would like to do is scan = slab_pages >> sc->priority but we don't know the number of slab pages each shrinker controls, only the objects. 
However say that theoretically we knew how many pages a shrinker controlled, we'd still have to convert this to objects, which would look like the following scan = shrinker_pages >> sc->priority scan_objects = (PAGE_SIZE / object_size) * scan or written another way scan_objects = (shrinker_pages >> sc->priority) * (PAGE_SIZE / object_size) which can thus be written scan_objects = ((shrinker_pages * PAGE_SIZE) / object_size) >> sc->priority which is just scan_objects = nr_objects >> sc->priority We don't need to know exactly how many pages each shrinker represents, it's objects are all the information we need. Making this change allows us to place an appropriate amount of pressure on the shrinker pools for their relative size. Link: http://lkml.kernel.org/r/1510780549-6812-1-git-send-email-josef@toxicpanda.com Signed-off-by: Josef Bacik Acked-by: Johannes Weiner Acked-by: Dave Chinner Acked-by: Andrey Ryabinin Cc: Michal Hocko Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/trace/events/vmscan.h | 23 ++++++++--------- mm/vmscan.c | 47 ++++++++++------------------------- 2 files changed, 23 insertions(+), 47 deletions(-) diff --git a/include/trace/events/vmscan.h b/include/trace/events/vmscan.h index d70b53e65f43..e0b8b9173e1c 100644 --- a/include/trace/events/vmscan.h +++ b/include/trace/events/vmscan.h @@ -192,12 +192,12 @@ DEFINE_EVENT(mm_vmscan_direct_reclaim_end_template, mm_vmscan_memcg_softlimit_re TRACE_EVENT(mm_shrink_slab_start, TP_PROTO(struct shrinker *shr, struct shrink_control *sc, - long nr_objects_to_shrink, unsigned long pgs_scanned, - unsigned long lru_pgs, unsigned long cache_items, - unsigned long long delta, unsigned long total_scan), + long nr_objects_to_shrink, unsigned long cache_items, + unsigned long long delta, unsigned long total_scan, + int priority), - TP_ARGS(shr, sc, nr_objects_to_shrink, pgs_scanned, lru_pgs, - cache_items, delta, total_scan), + TP_ARGS(shr, sc, nr_objects_to_shrink, cache_items, delta, total_scan, + priority), TP_STRUCT__entry( __field(struct shrinker *, shr) @@ -205,11 +205,10 @@ TRACE_EVENT(mm_shrink_slab_start, __field(int, nid) __field(long, nr_objects_to_shrink) __field(gfp_t, gfp_flags) - __field(unsigned long, pgs_scanned) - __field(unsigned long, lru_pgs) __field(unsigned long, cache_items) __field(unsigned long long, delta) __field(unsigned long, total_scan) + __field(int, priority) ), TP_fast_assign( @@ -218,24 +217,22 @@ TRACE_EVENT(mm_shrink_slab_start, __entry->nid = sc->nid; __entry->nr_objects_to_shrink = nr_objects_to_shrink; __entry->gfp_flags = sc->gfp_mask; - __entry->pgs_scanned = pgs_scanned; - __entry->lru_pgs = lru_pgs; __entry->cache_items = cache_items; __entry->delta = delta; __entry->total_scan = total_scan; + __entry->priority = priority; ), - TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s pgs_scanned %ld lru_pgs %ld cache items %ld delta %lld total_scan %ld", + TP_printk("%pF %p: nid: %d objects to shrink %ld gfp_flags %s cache items %ld delta %lld total_scan %ld priority %d", __entry->shrink, __entry->shr, __entry->nid, __entry->nr_objects_to_shrink, show_gfp_flags(__entry->gfp_flags), - __entry->pgs_scanned, - __entry->lru_pgs, __entry->cache_items, __entry->delta, - __entry->total_scan) + __entry->total_scan, + __entry->priority) ); TRACE_EVENT(mm_shrink_slab_end, diff --git a/mm/vmscan.c b/mm/vmscan.c index 47d5ced51f2d..e73274a60b22 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -310,9 +310,7 @@ 
EXPORT_SYMBOL(unregister_shrinker); #define SHRINK_BATCH 128 static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, - struct shrinker *shrinker, - unsigned long nr_scanned, - unsigned long nr_eligible) + struct shrinker *shrinker, int priority) { unsigned long freed = 0; unsigned long long delta; @@ -337,9 +335,9 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, nr = atomic_long_xchg(&shrinker->nr_deferred[nid], 0); total_scan = nr; - delta = (4 * nr_scanned) / shrinker->seeks; - delta *= freeable; - do_div(delta, nr_eligible + 1); + delta = freeable >> priority; + delta *= 4; + do_div(delta, shrinker->seeks); total_scan += delta; if (total_scan < 0) { pr_err("shrink_slab: %pF negative objects to delete nr=%ld\n", @@ -373,8 +371,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, total_scan = freeable * 2; trace_mm_shrink_slab_start(shrinker, shrinkctl, nr, - nr_scanned, nr_eligible, - freeable, delta, total_scan); + freeable, delta, total_scan, priority); /* * Normally, we should not scan less than batch_size objects in one @@ -434,8 +431,7 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * @gfp_mask: allocation context * @nid: node whose slab caches to target * @memcg: memory cgroup whose slab caches to target - * @nr_scanned: pressure numerator - * @nr_eligible: pressure denominator + * @priority: the reclaim priority * * Call the shrink functions to age shrinkable caches. * @@ -447,20 +443,14 @@ static unsigned long do_shrink_slab(struct shrink_control *shrinkctl, * objects from the memory cgroup specified. Otherwise, only unaware * shrinkers are called. * - * @nr_scanned and @nr_eligible form a ratio that indicate how much of - * the available objects should be scanned. Page reclaim for example - * passes the number of pages scanned and the number of pages on the - * LRU lists that it considered on @nid, plus a bias in @nr_scanned - * when it encountered mapped pages. The ratio is further biased by - * the ->seeks setting of the shrink function, which indicates the - * cost to recreate an object relative to that of an LRU page. + * @priority is sc->priority, we take the number of objects and >> by priority + * in order to get the scan target. * * Returns the number of reclaimed slab objects. 
*/ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, struct mem_cgroup *memcg, - unsigned long nr_scanned, - unsigned long nr_eligible) + int priority) { struct shrinker *shrinker; unsigned long freed = 0; @@ -468,9 +458,6 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (memcg && (!memcg_kmem_enabled() || !mem_cgroup_online(memcg))) return 0; - if (nr_scanned == 0) - nr_scanned = SWAP_CLUSTER_MAX; - if (!down_read_trylock(&shrinker_rwsem)) { /* * If we would return 0, our callers would understand that we @@ -501,7 +488,7 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, if (!(shrinker->flags & SHRINKER_NUMA_AWARE)) sc.nid = 0; - freed += do_shrink_slab(&sc, shrinker, nr_scanned, nr_eligible); + freed += do_shrink_slab(&sc, shrinker, priority); } up_read(&shrinker_rwsem); @@ -519,8 +506,7 @@ void drop_slab_node(int nid) freed = 0; do { - freed += shrink_slab(GFP_KERNEL, nid, memcg, - 1000, 1000); + freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); } while (freed > 10); } @@ -2615,14 +2601,12 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) reclaimed = sc->nr_reclaimed; scanned = sc->nr_scanned; - shrink_node_memcg(pgdat, memcg, sc, &lru_pages); node_lru_pages += lru_pages; if (memcg) shrink_slab(sc->gfp_mask, pgdat->node_id, - memcg, sc->nr_scanned - scanned, - lru_pages); + memcg, sc->priority); /* Record the group's reclaim efficiency */ vmpressure(sc->gfp_mask, memcg, false, @@ -2646,14 +2630,9 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) } } while ((memcg = mem_cgroup_iter(root, memcg, &reclaim))); - /* - * Shrink the slab caches in the same proportion that - * the eligible LRU pages were scanned. - */ if (global_reclaim(sc)) shrink_slab(sc->gfp_mask, pgdat->node_id, NULL, - sc->nr_scanned - nr_scanned, - node_lru_pages); + sc->priority); if (reclaim_state) { sc->nr_reclaimed += reclaim_state->reclaimed_slab; From 80b1f41c0957a9da3bab4fb9ae76dc886753a59b Mon Sep 17 00:00:00 2001 From: Pavel Tatashin Date: Wed, 31 Jan 2018 16:16:30 -0800 Subject: [PATCH 039/118] mm: split deferred_init_range into initializing and freeing parts In deferred_init_range() we initialize struct pages, and also free them to buddy allocator. We do it in separate loops, because buddy page is computed ahead, so we do not want to access a struct page that has not been initialized yet. There is still, however, a corner case where it is potentially possible to access uninitialized struct page: this is when buddy page is from the next memblock range. This patch fixes this problem by splitting deferred_init_range() into two functions: one to initialize struct pages, and another to free them. In addition, this patch brings the following improvements: - Get rid of __def_free() helper function. And simplifies loop logic by adding a new pfn validity check function: deferred_pfn_valid(). - Reduces number of variables that we track. So, there is a higher chance that we will avoid using stack to store/load variables inside hot loops. - Enables future multi-threading of these functions: do initialization in multiple threads, wait for all threads to finish, do freeing part in multithreading. Tested on x86 with 1T of memory to make sure no regressions are introduced. 
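The corner case mentioned above boils down to how the buddy pfn is computed when a page is freed: the buddy is found by flipping a single bit of the pfn, so it can lie ahead of the page currently being freed, possibly in the next memblock range. A stripped-down sketch of that computation (see __find_buddy_pfn() in mm/internal.h for the real helper; the name below is illustrative):

/* buddy of @page_pfn at @order: one bit flipped, may point forward */
static inline unsigned long buddy_pfn_sketch(unsigned long page_pfn,
					     unsigned int order)
{
	return page_pfn ^ (1UL << order);
}

Because the buddy's struct page is read in __free_one_page(), all struct pages of the range have to be initialized before any of them is handed to the buddy allocator, which is what the split into deferred_init_pages() and deferred_free_pages() guarantees.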
[akpm@linux-foundation.org: fix spello in comment] Link: http://lkml.kernel.org/r/20171107150446.32055-2-pasha.tatashin@oracle.com Signed-off-by: Pavel Tatashin Acked-by: Michal Hocko Cc: Steven Sistare Cc: Daniel Jordan Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 150 +++++++++++++++++++++++++----------------------- 1 file changed, 78 insertions(+), 72 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 76c9688b6a0a..a73cffe287a5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1457,92 +1457,87 @@ static inline void __init pgdat_init_report_one_done(void) } /* - * Helper for deferred_init_range, free the given range, reset the counters, and - * return number of pages freed. + * Returns true if page needs to be initialized or freed to buddy allocator. + * + * First we check if pfn is valid on architectures where it is possible to have + * holes within pageblock_nr_pages. On systems where it is not possible, this + * function is optimized out. + * + * Then, we check if a current large page is valid by only checking the validity + * of the head pfn. + * + * Finally, meminit_pfn_in_nid is checked on systems where pfns can interleave + * within a node: a pfn is between start and end of a node, but does not belong + * to this memory node. */ -static inline unsigned long __init __def_free(unsigned long *nr_free, - unsigned long *free_base_pfn, - struct page **page) +static inline bool __init +deferred_pfn_valid(int nid, unsigned long pfn, + struct mminit_pfnnid_cache *nid_init_state) { - unsigned long nr = *nr_free; - - deferred_free_range(*free_base_pfn, nr); - *free_base_pfn = 0; - *nr_free = 0; - *page = NULL; - - return nr; + if (!pfn_valid_within(pfn)) + return false; + if (!(pfn & (pageblock_nr_pages - 1)) && !pfn_valid(pfn)) + return false; + if (!meminit_pfn_in_nid(pfn, nid, nid_init_state)) + return false; + return true; } -static unsigned long __init deferred_init_range(int nid, int zid, - unsigned long start_pfn, - unsigned long end_pfn) +/* + * Free pages to buddy allocator. Try to free aligned pages in + * pageblock_nr_pages sizes. + */ +static void __init deferred_free_pages(int nid, int zid, unsigned long pfn, + unsigned long end_pfn) { struct mminit_pfnnid_cache nid_init_state = { }; unsigned long nr_pgmask = pageblock_nr_pages - 1; - unsigned long free_base_pfn = 0; - unsigned long nr_pages = 0; unsigned long nr_free = 0; - struct page *page = NULL; - unsigned long pfn; - /* - * First we check if pfn is valid on architectures where it is possible - * to have holes within pageblock_nr_pages. On systems where it is not - * possible, this function is optimized out. - * - * Then, we check if a current large page is valid by only checking the - * validity of the head pfn. - * - * meminit_pfn_in_nid is checked on systems where pfns can interleave - * within a node: a pfn is between start and end of a node, but does not - * belong to this memory node. - * - * Finally, we minimize pfn page lookups and scheduler checks by - * performing it only once every pageblock_nr_pages. - * - * We do it in two loops: first we initialize struct page, than free to - * buddy allocator, becuse while we are freeing pages we can access - * pages that are ahead (computing buddy page in __free_one_page()). 
- */ - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) - continue; - if ((pfn & nr_pgmask) || pfn_valid(pfn)) { - if (meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - if (page && (pfn & nr_pgmask)) - page++; - else - page = pfn_to_page(pfn); - __init_single_page(page, pfn, zid, nid); - cond_resched(); - } - } - } - - page = NULL; - for (pfn = start_pfn; pfn < end_pfn; pfn++) { - if (!pfn_valid_within(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!(pfn & nr_pgmask) && !pfn_valid(pfn)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (!meminit_pfn_in_nid(pfn, nid, &nid_init_state)) { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - } else if (page && (pfn & nr_pgmask)) { - page++; - nr_free++; - } else { - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); - page = pfn_to_page(pfn); - free_base_pfn = pfn; + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + deferred_free_range(pfn - nr_free, nr_free); + nr_free = 0; + } else if (!(pfn & nr_pgmask)) { + deferred_free_range(pfn - nr_free, nr_free); nr_free = 1; cond_resched(); + } else { + nr_free++; } } /* Free the last block of pages to allocator */ - nr_pages += __def_free(&nr_free, &free_base_pfn, &page); + deferred_free_range(pfn - nr_free, nr_free); +} - return nr_pages; +/* + * Initialize struct pages. We minimize pfn page lookups and scheduler checks + * by performing it only once every pageblock_nr_pages. + * Return number of pages initialized. + */ +static unsigned long __init deferred_init_pages(int nid, int zid, + unsigned long pfn, + unsigned long end_pfn) +{ + struct mminit_pfnnid_cache nid_init_state = { }; + unsigned long nr_pgmask = pageblock_nr_pages - 1; + unsigned long nr_pages = 0; + struct page *page = NULL; + + for (; pfn < end_pfn; pfn++) { + if (!deferred_pfn_valid(nid, pfn, &nid_init_state)) { + page = NULL; + continue; + } else if (!page || !(pfn & nr_pgmask)) { + page = pfn_to_page(pfn); + cond_resched(); + } else { + page++; + } + __init_single_page(page, pfn, zid, nid); + nr_pages++; + } + return (nr_pages); } /* Initialise remaining memory on a node */ @@ -1582,10 +1577,21 @@ static int __init deferred_init_memmap(void *data) } first_init_pfn = max(zone->zone_start_pfn, first_init_pfn); + /* + * Initialize and free pages. We do it in two loops: first we initialize + * struct page, than free to buddy allocator, because while we are + * freeing pages we can access pages that are ahead (computing buddy + * page in __free_one_page()). 
+ */ for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); - nr_pages += deferred_init_range(nid, zid, spfn, epfn); + nr_pages += deferred_init_pages(nid, zid, spfn, epfn); + } + for_each_free_mem_range(i, nid, MEMBLOCK_NONE, &spa, &epa, NULL) { + spfn = max_t(unsigned long, first_init_pfn, PFN_UP(spa)); + epfn = min_t(unsigned long, zone_end_pfn(zone), PFN_DOWN(epa)); + deferred_free_pages(nid, zid, spfn, epfn); } /* Sanity check that the next zone really is unpopulated */ From 2b9fceb3b47b7c44fb04eef068f441e7b18daa68 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:16:34 -0800 Subject: [PATCH 040/118] mm/filemap.c: remove include of hardirq.h in_atomic() has been moved to include/linux/preempt.h, and the filemap.c doesn't use in_atomic() directly at all, so it sounds unnecessary to include hardirq.h. Link: http://lkml.kernel.org/r/1509985319-38633-1-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Reviewed-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/filemap.c | 1 - 1 file changed, 1 deletion(-) diff --git a/mm/filemap.c b/mm/filemap.c index ee83baaf855d..693f62212a59 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -31,7 +31,6 @@ #include #include #include -#include /* for BUG_ON(!in_atomic()) only */ #include #include #include From c9019e9bf42e66d028d70d2da6206cad4dd9250d Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:37 -0800 Subject: [PATCH 041/118] mm: memcontrol: eliminate raw access to stat and event counters Replace all raw 'this_cpu_' modifications of the stat and event per-cpu counters with API functions such as mod_memcg_state(). This makes the code easier to read, but is also in preparation for the next patch, which changes the per-cpu implementation of those counters. 
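As a usage illustration of the conversion described above, a call site that previously poked the per-cpu arrays directly now goes through the accessors. memcg_account_example() is a made-up name for the sketch; the helpers and the stat/event items are the ones appearing in the diff below.

static void memcg_account_example(struct mem_cgroup *memcg, int nr_pages)
{
	/* was: this_cpu_add(memcg->stat->count[NR_FILE_MAPPED], nr_pages) */
	mod_memcg_state(memcg, NR_FILE_MAPPED, nr_pages);

	/* was: this_cpu_inc(memcg->stat->events[PGPGIN]) */
	count_memcg_events(memcg, PGPGIN, 1);
}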
Link: http://lkml.kernel.org/r/20171103153336.24044-1-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 31 +++++++++++++------- mm/memcontrol.c | 59 ++++++++++++++++---------------------- 2 files changed, 45 insertions(+), 45 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 69966c461d1c..2c80b69dd266 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -272,13 +272,6 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -static inline void mem_cgroup_event(struct mem_cgroup *memcg, - enum memcg_event_item event) -{ - this_cpu_inc(memcg->stat->events[event]); - cgroup_file_notify(&memcg->events_file); -} - bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, @@ -627,15 +620,23 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, gfp_t gfp_mask, unsigned long *total_scanned); +/* idx can be of type enum memcg_event_item or vm_event_item */ +static inline void __count_memcg_events(struct mem_cgroup *memcg, + int idx, unsigned long count) +{ + if (!mem_cgroup_disabled()) + __this_cpu_add(memcg->stat->events[idx], count); +} + +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_events(struct mem_cgroup *memcg, - enum vm_event_item idx, - unsigned long count) + int idx, unsigned long count) { if (!mem_cgroup_disabled()) this_cpu_add(memcg->stat->events[idx], count); } -/* idx can be of type enum memcg_stat_item or node_stat_item */ +/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_page_event(struct page *page, int idx) { @@ -654,12 +655,20 @@ static inline void count_memcg_event_mm(struct mm_struct *mm, rcu_read_lock(); memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); if (likely(memcg)) { - this_cpu_inc(memcg->stat->events[idx]); + count_memcg_events(memcg, idx, 1); if (idx == OOM_KILL) cgroup_file_notify(&memcg->events_file); } rcu_read_unlock(); } + +static inline void mem_cgroup_event(struct mem_cgroup *memcg, + enum memcg_event_item event) +{ + count_memcg_events(memcg, event, 1); + cgroup_file_notify(&memcg->events_file); +} + #ifdef CONFIG_TRANSPARENT_HUGEPAGE void mem_cgroup_split_huge_fixup(struct page *head); #endif diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 9011997d8a5c..23841af1d756 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -586,23 +586,23 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, * counted as CACHE even if it's on ANON LRU. */ if (PageAnon(page)) - __this_cpu_add(memcg->stat->count[MEMCG_RSS], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS, nr_pages); else { - __this_cpu_add(memcg->stat->count[MEMCG_CACHE], nr_pages); + __mod_memcg_state(memcg, MEMCG_CACHE, nr_pages); if (PageSwapBacked(page)) - __this_cpu_add(memcg->stat->count[NR_SHMEM], nr_pages); + __mod_memcg_state(memcg, NR_SHMEM, nr_pages); } if (compound) { VM_BUG_ON_PAGE(!PageTransHuge(page), page); - __this_cpu_add(memcg->stat->count[MEMCG_RSS_HUGE], nr_pages); + __mod_memcg_state(memcg, MEMCG_RSS_HUGE, nr_pages); } /* pagein of a big page is an event. 
So, ignore page size */ if (nr_pages > 0) - __this_cpu_inc(memcg->stat->events[PGPGIN]); + __count_memcg_events(memcg, PGPGIN, 1); else { - __this_cpu_inc(memcg->stat->events[PGPGOUT]); + __count_memcg_events(memcg, PGPGOUT, 1); nr_pages = -nr_pages; /* for event */ } @@ -2415,18 +2415,11 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) head[i].mem_cgroup = head->mem_cgroup; - __this_cpu_sub(head->mem_cgroup->stat->count[MEMCG_RSS_HUGE], - HPAGE_PMD_NR); + __mod_memcg_state(head->mem_cgroup, MEMCG_RSS_HUGE, -HPAGE_PMD_NR); } #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ #ifdef CONFIG_MEMCG_SWAP -static void mem_cgroup_swap_statistics(struct mem_cgroup *memcg, - int nr_entries) -{ - this_cpu_add(memcg->stat->count[MEMCG_SWAP], nr_entries); -} - /** * mem_cgroup_move_swap_account - move swap charge and swap_cgroup's record. * @entry: swap entry to be moved @@ -2450,8 +2443,8 @@ static int mem_cgroup_move_swap_account(swp_entry_t entry, new_id = mem_cgroup_id(to); if (swap_cgroup_cmpxchg(entry, old_id, new_id) == old_id) { - mem_cgroup_swap_statistics(from, -1); - mem_cgroup_swap_statistics(to, 1); + mod_memcg_state(from, MEMCG_SWAP, -1); + mod_memcg_state(to, MEMCG_SWAP, 1); return 0; } return -EINVAL; @@ -4584,8 +4577,8 @@ static int mem_cgroup_move_account(struct page *page, spin_lock_irqsave(&from->move_lock, flags); if (!anon && page_mapped(page)) { - __this_cpu_sub(from->stat->count[NR_FILE_MAPPED], nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_MAPPED], nr_pages); + __mod_memcg_state(from, NR_FILE_MAPPED, -nr_pages); + __mod_memcg_state(to, NR_FILE_MAPPED, nr_pages); } /* @@ -4597,16 +4590,14 @@ static int mem_cgroup_move_account(struct page *page, struct address_space *mapping = page_mapping(page); if (mapping_cap_account_dirty(mapping)) { - __this_cpu_sub(from->stat->count[NR_FILE_DIRTY], - nr_pages); - __this_cpu_add(to->stat->count[NR_FILE_DIRTY], - nr_pages); + __mod_memcg_state(from, NR_FILE_DIRTY, -nr_pages); + __mod_memcg_state(to, NR_FILE_DIRTY, nr_pages); } } if (PageWriteback(page)) { - __this_cpu_sub(from->stat->count[NR_WRITEBACK], nr_pages); - __this_cpu_add(to->stat->count[NR_WRITEBACK], nr_pages); + __mod_memcg_state(from, NR_WRITEBACK, -nr_pages); + __mod_memcg_state(to, NR_WRITEBACK, nr_pages); } /* @@ -5642,11 +5633,11 @@ static void uncharge_batch(const struct uncharge_gather *ug) } local_irq_save(flags); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS], ug->nr_anon); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_CACHE], ug->nr_file); - __this_cpu_sub(ug->memcg->stat->count[MEMCG_RSS_HUGE], ug->nr_huge); - __this_cpu_sub(ug->memcg->stat->count[NR_SHMEM], ug->nr_shmem); - __this_cpu_add(ug->memcg->stat->events[PGPGOUT], ug->pgpgout); + __mod_memcg_state(ug->memcg, MEMCG_RSS, -ug->nr_anon); + __mod_memcg_state(ug->memcg, MEMCG_CACHE, -ug->nr_file); + __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); + __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); + __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); @@ -5874,7 +5865,7 @@ bool mem_cgroup_charge_skmem(struct mem_cgroup *memcg, unsigned int nr_pages) if (in_softirq()) gfp_mask = GFP_NOWAIT; - this_cpu_add(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, nr_pages); if (try_charge(memcg, gfp_mask, nr_pages) == 0) return true; @@ -5895,7 +5886,7 @@ void mem_cgroup_uncharge_skmem(struct mem_cgroup 
*memcg, unsigned int nr_pages) return; } - this_cpu_sub(memcg->stat->count[MEMCG_SOCK], nr_pages); + mod_memcg_state(memcg, MEMCG_SOCK, -nr_pages); refill_stock(memcg, nr_pages); } @@ -6019,7 +6010,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) oldid = swap_cgroup_record(entry, mem_cgroup_id(swap_memcg), nr_entries); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(swap_memcg, nr_entries); + mod_memcg_state(swap_memcg, MEMCG_SWAP, nr_entries); page->mem_cgroup = NULL; @@ -6085,7 +6076,7 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) mem_cgroup_id_get_many(memcg, nr_pages - 1); oldid = swap_cgroup_record(entry, mem_cgroup_id(memcg), nr_pages); VM_BUG_ON_PAGE(oldid, page); - mem_cgroup_swap_statistics(memcg, nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, nr_pages); return 0; } @@ -6113,7 +6104,7 @@ void mem_cgroup_uncharge_swap(swp_entry_t entry, unsigned int nr_pages) else page_counter_uncharge(&memcg->memsw, nr_pages); } - mem_cgroup_swap_statistics(memcg, -nr_pages); + mod_memcg_state(memcg, MEMCG_SWAP, -nr_pages); mem_cgroup_id_put_many(memcg, nr_pages); } rcu_read_unlock(); From 284542656e22c43fdada8c8cc0ca9ede8453eed7 Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:41 -0800 Subject: [PATCH 042/118] mm: memcontrol: implement lruvec stat functions on top of each other The implementation of the lruvec stat functions and their variants for accounting through a page, or accounting from a preemptible context, are mostly identical and needlessly repetitive. Implement the lruvec_page functions by looking up the page's lruvec and then using the lruvec function. Implement the functions for preemptible contexts by disabling preemption before calling the atomic context functions. Link: http://lkml.kernel.org/r/20171103153336.24044-2-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 44 +++++++++++++++++++------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 2c80b69dd266..1ffc54ac4cc9 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -569,51 +569,51 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec, { struct mem_cgroup_per_node *pn; + /* Update node */ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); + if (mem_cgroup_disabled()) return; + pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); + + /* Update memcg */ __mod_memcg_state(pn->memcg, idx, val); + + /* Update lruvec */ __this_cpu_add(pn->lruvec_stat->count[idx], val); } static inline void mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(lruvec_pgdat(lruvec), idx, val); - if (mem_cgroup_disabled()) - return; - pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - mod_memcg_state(pn->memcg, idx, val); - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_state(lruvec, idx, val); + preempt_enable(); } static inline void __mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; + pg_data_t *pgdat = page_pgdat(page); + struct lruvec *lruvec; - __mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) + /* Untracked pages have no memcg, no lruvec. 
Update only the node */ + if (!page->mem_cgroup) { + __mod_node_page_state(pgdat, idx, val); return; - __mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - __this_cpu_add(pn->lruvec_stat->count[idx], val); + } + + lruvec = mem_cgroup_lruvec(pgdat, page->mem_cgroup); + __mod_lruvec_state(lruvec, idx, val); } static inline void mod_lruvec_page_state(struct page *page, enum node_stat_item idx, int val) { - struct mem_cgroup_per_node *pn; - - mod_node_page_state(page_pgdat(page), idx, val); - if (mem_cgroup_disabled() || !page->mem_cgroup) - return; - mod_memcg_state(page->mem_cgroup, idx, val); - pn = page->mem_cgroup->nodeinfo[page_to_nid(page)]; - this_cpu_add(pn->lruvec_stat->count[idx], val); + preempt_disable(); + __mod_lruvec_page_state(page, idx, val); + preempt_enable(); } unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, From a983b5ebee57209c99f68c8327072f25e0e6e3da Mon Sep 17 00:00:00 2001 From: Johannes Weiner Date: Wed, 31 Jan 2018 16:16:45 -0800 Subject: [PATCH 043/118] mm: memcontrol: fix excessive complexity in memory.stat reporting We've seen memory.stat reads in top-level cgroups take up to fourteen seconds during a userspace bug that created tens of thousands of ghost cgroups pinned by lingering page cache. Even with a more reasonable number of cgroups, aggregating memory.stat is unnecessarily heavy. The complexity is this: nr_cgroups * nr_stat_items * nr_possible_cpus where the stat items are ~70 at this point. With 128 cgroups and 128 CPUs - decent, not enormous setups - reading the top-level memory.stat has to aggregate over a million per-cpu counters. This doesn't scale. Instead of spreading the source of truth across all CPUs, use the per-cpu counters merely to batch updates to shared atomic counters. This is the same as the per-cpu stocks we use for charging memory to the shared atomic page_counters, and also the way the global vmstat counters are implemented. Vmstat has elaborate spilling thresholds that depend on the number of CPUs, amount of memory, and memory pressure - carefully balancing the cost of counter updates with the amount of per-cpu error. That's because the vmstat counters are system-wide, but also used for decisions inside the kernel (e.g. NR_FREE_PAGES in the allocator). Neither is true for the memory controller. Use the same static batch size we already use for page_counter updates during charging. The per-cpu error in the stats will be 128k, which is an acceptable ratio of cores to memory accounting granularity. 
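A generic sketch of that batching scheme, detached from memcg (the struct and function names below are illustrative; __mod_memcg_state() in the diff is the real instance): deltas accumulate in a per-cpu slack counter and are folded into the shared atomic only once they exceed a fixed batch, so each cpu can hold back at most the batch size, i.e. the 32 pages / 128k mentioned above.

#define BATCH 32	/* matches MEMCG_CHARGE_BATCH */

struct batched_counter {
	atomic_long_t	total;		/* shared, cheap to read */
	long __percpu	*pending;	/* per-cpu slack, written locally */
};

/* caller must have preemption disabled, like the __mod_*() variants */
static void batched_add(struct batched_counter *c, long val)
{
	long x = val + __this_cpu_read(*c->pending);

	if (unlikely(abs(x) > BATCH)) {
		atomic_long_add(x, &c->total);
		x = 0;
	}
	__this_cpu_write(*c->pending, x);
}

static long batched_read(struct batched_counter *c)
{
	return atomic_long_read(&c->total);	/* off by at most BATCH * nr_cpus */
}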
[hannes@cmpxchg.org: fix warning in __this_cpu_xchg() calls] Link: http://lkml.kernel.org/r/20171201135750.GB8097@cmpxchg.org Link: http://lkml.kernel.org/r/20171103153336.24044-3-hannes@cmpxchg.org Signed-off-by: Johannes Weiner Acked-by: Vladimir Davydov Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 96 ++++++++++++++++++++++------------- mm/memcontrol.c | 101 +++++++++++++++++++------------------ 2 files changed, 113 insertions(+), 84 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 1ffc54ac4cc9..882046863581 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -108,7 +108,10 @@ struct lruvec_stat { */ struct mem_cgroup_per_node { struct lruvec lruvec; - struct lruvec_stat __percpu *lruvec_stat; + + struct lruvec_stat __percpu *lruvec_stat_cpu; + atomic_long_t lruvec_stat[NR_VM_NODE_STAT_ITEMS]; + unsigned long lru_zone_size[MAX_NR_ZONES][NR_LRU_LISTS]; struct mem_cgroup_reclaim_iter iter[DEF_PRIORITY + 1]; @@ -227,10 +230,10 @@ struct mem_cgroup { spinlock_t move_lock; struct task_struct *move_lock_task; unsigned long move_lock_flags; - /* - * percpu counter. - */ - struct mem_cgroup_stat_cpu __percpu *stat; + + struct mem_cgroup_stat_cpu __percpu *stat_cpu; + atomic_long_t stat[MEMCG_NR_STAT]; + atomic_long_t events[MEMCG_NR_EVENTS]; unsigned long socket_pressure; @@ -265,6 +268,12 @@ struct mem_cgroup { /* WARNING: nodeinfo must be the last member here */ }; +/* + * size of first charge trial. "32" comes from vmscan.c's magic value. + * TODO: maybe necessary to use big numbers in big irons. + */ +#define MEMCG_CHARGE_BATCH 32U + extern struct mem_cgroup *root_mem_cgroup; static inline bool mem_cgroup_disabled(void) @@ -485,32 +494,38 @@ void unlock_page_memcg(struct page *page); static inline unsigned long memcg_page_state(struct mem_cgroup *memcg, int idx) { - long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + long x = atomic_long_read(&memcg->stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void __mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->count[idx], val); + long x; + + if (mem_cgroup_disabled()) + return; + + x = val + __this_cpu_read(memcg->stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->stat[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->count[idx], x); } /* idx can be of type enum memcg_stat_item or node_stat_item */ static inline void mod_memcg_state(struct mem_cgroup *memcg, int idx, int val) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->count[idx], val); + preempt_disable(); + __mod_memcg_state(memcg, idx, val); + preempt_enable(); } /** @@ -548,26 +563,25 @@ static inline unsigned long lruvec_page_state(struct lruvec *lruvec, enum node_stat_item idx) { struct mem_cgroup_per_node *pn; - long val = 0; - int cpu; + long x; if (mem_cgroup_disabled()) return node_page_state(lruvec_pgdat(lruvec), idx); pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); - for_each_possible_cpu(cpu) - val += per_cpu(pn->lruvec_stat->count[idx], cpu); - - if (val < 0) - val = 0; - - return val; + x = atomic_long_read(&pn->lruvec_stat[idx]); +#ifdef CONFIG_SMP + if (x < 0) + x = 0; +#endif + return x; } static inline 
void __mod_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, int val) { struct mem_cgroup_per_node *pn; + long x; /* Update node */ __mod_node_page_state(lruvec_pgdat(lruvec), idx, val); @@ -581,7 +595,12 @@ static inline void __mod_lruvec_state(struct lruvec *lruvec, __mod_memcg_state(pn->memcg, idx, val); /* Update lruvec */ - __this_cpu_add(pn->lruvec_stat->count[idx], val); + x = val + __this_cpu_read(pn->lruvec_stat_cpu->count[idx]); + if (unlikely(abs(x) > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &pn->lruvec_stat[idx]); + x = 0; + } + __this_cpu_write(pn->lruvec_stat_cpu->count[idx], x); } static inline void mod_lruvec_state(struct lruvec *lruvec, @@ -624,16 +643,25 @@ unsigned long mem_cgroup_soft_limit_reclaim(pg_data_t *pgdat, int order, static inline void __count_memcg_events(struct mem_cgroup *memcg, int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - __this_cpu_add(memcg->stat->events[idx], count); + unsigned long x; + + if (mem_cgroup_disabled()) + return; + + x = count + __this_cpu_read(memcg->stat_cpu->events[idx]); + if (unlikely(x > MEMCG_CHARGE_BATCH)) { + atomic_long_add(x, &memcg->events[idx]); + x = 0; + } + __this_cpu_write(memcg->stat_cpu->events[idx], x); } -/* idx can be of type enum memcg_event_item or vm_event_item */ static inline void count_memcg_events(struct mem_cgroup *memcg, int idx, unsigned long count) { - if (!mem_cgroup_disabled()) - this_cpu_add(memcg->stat->events[idx], count); + preempt_disable(); + __count_memcg_events(memcg, idx, count); + preempt_enable(); } /* idx can be of type enum memcg_event_item or vm_event_item */ diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 23841af1d756..51d398f1363c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -542,39 +542,10 @@ mem_cgroup_largest_soft_limit_node(struct mem_cgroup_tree_per_node *mctz) return mz; } -/* - * Return page count for single (non recursive) @memcg. - * - * Implementation Note: reading percpu statistics for memcg. - * - * Both of vmstat[] and percpu_counter has threshold and do periodic - * synchronization to implement "quick" read. There are trade-off between - * reading cost and precision of value. Then, we may have a chance to implement - * a periodic synchronization of counter in memcg's counter. - * - * But this _read() function is used for user interface now. The user accounts - * memory usage by memory cgroup and he _always_ requires exact value because - * he accounts memory. Even if we provide quick-and-fuzzy read, we always - * have to visit all online cpus and make sum. So, for now, unnecessary - * synchronization is not implemented. (just implemented for cpu hotplug) - * - * If there are kernel internal actions which can make use of some not-exact - * value, and reading all cpu value can be performance bottleneck in some - * common workload, threshold and synchronization as vmstat[] should be - * implemented. - * - * The parameter idx can be of type enum memcg_event_item or vm_event_item. 
- */ - static unsigned long memcg_sum_events(struct mem_cgroup *memcg, int event) { - unsigned long val = 0; - int cpu; - - for_each_possible_cpu(cpu) - val += per_cpu(memcg->stat->events[event], cpu); - return val; + return atomic_long_read(&memcg->events[event]); } static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, @@ -606,7 +577,7 @@ static void mem_cgroup_charge_statistics(struct mem_cgroup *memcg, nr_pages = -nr_pages; /* for event */ } - __this_cpu_add(memcg->stat->nr_page_events, nr_pages); + __this_cpu_add(memcg->stat_cpu->nr_page_events, nr_pages); } unsigned long mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, @@ -642,8 +613,8 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, { unsigned long val, next; - val = __this_cpu_read(memcg->stat->nr_page_events); - next = __this_cpu_read(memcg->stat->targets[target]); + val = __this_cpu_read(memcg->stat_cpu->nr_page_events); + next = __this_cpu_read(memcg->stat_cpu->targets[target]); /* from time_after() in jiffies.h */ if ((long)(next - val) < 0) { switch (target) { @@ -659,7 +630,7 @@ static bool mem_cgroup_event_ratelimit(struct mem_cgroup *memcg, default: break; } - __this_cpu_write(memcg->stat->targets[target], next); + __this_cpu_write(memcg->stat_cpu->targets[target], next); return true; } return false; @@ -1707,11 +1678,6 @@ void unlock_page_memcg(struct page *page) } EXPORT_SYMBOL(unlock_page_memcg); -/* - * size of first charge trial. "32" comes from vmscan.c's magic value. - * TODO: maybe necessary to use big numbers in big irons. - */ -#define CHARGE_BATCH 32U struct memcg_stock_pcp { struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -1739,7 +1705,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) unsigned long flags; bool ret = false; - if (nr_pages > CHARGE_BATCH) + if (nr_pages > MEMCG_CHARGE_BATCH) return ret; local_irq_save(flags); @@ -1808,7 +1774,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) } stock->nr_pages += nr_pages; - if (stock->nr_pages > CHARGE_BATCH) + if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); local_irq_restore(flags); @@ -1858,9 +1824,44 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) static int memcg_hotplug_cpu_dead(unsigned int cpu) { struct memcg_stock_pcp *stock; + struct mem_cgroup *memcg; stock = &per_cpu(memcg_stock, cpu); drain_stock(stock); + + for_each_mem_cgroup(memcg) { + int i; + + for (i = 0; i < MEMCG_NR_STAT; i++) { + int nid; + long x; + + x = this_cpu_xchg(memcg->stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &memcg->stat[i]); + + if (i >= NR_VM_NODE_STAT_ITEMS) + continue; + + for_each_node(nid) { + struct mem_cgroup_per_node *pn; + + pn = mem_cgroup_nodeinfo(memcg, nid); + x = this_cpu_xchg(pn->lruvec_stat_cpu->count[i], 0); + if (x) + atomic_long_add(x, &pn->lruvec_stat[i]); + } + } + + for (i = 0; i < MEMCG_NR_EVENTS; i++) { + long x; + + x = this_cpu_xchg(memcg->stat_cpu->events[i], 0); + if (x) + atomic_long_add(x, &memcg->events[i]); + } + } + return 0; } @@ -1881,7 +1882,7 @@ static void high_work_func(struct work_struct *work) struct mem_cgroup *memcg; memcg = container_of(work, struct mem_cgroup, high_work); - reclaim_high(memcg, CHARGE_BATCH, GFP_KERNEL); + reclaim_high(memcg, MEMCG_CHARGE_BATCH, GFP_KERNEL); } /* @@ -1905,7 +1906,7 @@ void mem_cgroup_handle_over_high(void) static int try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, unsigned int nr_pages) { - unsigned int batch = max(CHARGE_BATCH, 
nr_pages); + unsigned int batch = max(MEMCG_CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; @@ -4161,8 +4162,8 @@ static int alloc_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) if (!pn) return 1; - pn->lruvec_stat = alloc_percpu(struct lruvec_stat); - if (!pn->lruvec_stat) { + pn->lruvec_stat_cpu = alloc_percpu(struct lruvec_stat); + if (!pn->lruvec_stat_cpu) { kfree(pn); return 1; } @@ -4180,7 +4181,7 @@ static void free_mem_cgroup_per_node_info(struct mem_cgroup *memcg, int node) { struct mem_cgroup_per_node *pn = memcg->nodeinfo[node]; - free_percpu(pn->lruvec_stat); + free_percpu(pn->lruvec_stat_cpu); kfree(pn); } @@ -4190,7 +4191,7 @@ static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); - free_percpu(memcg->stat); + free_percpu(memcg->stat_cpu); kfree(memcg); } @@ -4219,8 +4220,8 @@ static struct mem_cgroup *mem_cgroup_alloc(void) if (memcg->id.id < 0) goto fail; - memcg->stat = alloc_percpu(struct mem_cgroup_stat_cpu); - if (!memcg->stat) + memcg->stat_cpu = alloc_percpu(struct mem_cgroup_stat_cpu); + if (!memcg->stat_cpu) goto fail; for_each_node(node) @@ -5638,7 +5639,7 @@ static void uncharge_batch(const struct uncharge_gather *ug) __mod_memcg_state(ug->memcg, MEMCG_RSS_HUGE, -ug->nr_huge); __mod_memcg_state(ug->memcg, NR_SHMEM, -ug->nr_shmem); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); - __this_cpu_add(ug->memcg->stat->nr_page_events, nr_pages); + __this_cpu_add(ug->memcg->stat_cpu->nr_page_events, nr_pages); memcg_check_events(ug->memcg, ug->dummy_page); local_irq_restore(flags); From 8e33771ca41245a7c7f7a3c84f5cbd6625620a89 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Wed, 31 Jan 2018 16:16:48 -0800 Subject: [PATCH 044/118] mm/page_owner.c: use PTR_ERR_OR_ZERO() Fix ptr_ret.cocci warnings: mm/page_owner.c:639:1-3: WARNING: PTR_ERR_OR_ZERO can be used Use PTR_ERR_OR_ZERO rather than if(IS_ERR(...)) + PTR_ERR Generated by: scripts/coccinelle/api/ptr_ret.cocci Link: http://lkml.kernel.org/r/1511824101-9597-1-git-send-email-gomonovych@gmail.com Signed-off-by: Vasyl Gomonovych Acked-by: Vlastimil Babka Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 270a8219ccd0..06a0055f45a6 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -635,9 +635,7 @@ static int __init pageowner_init(void) dentry = debugfs_create_file("page_owner", S_IRUSR, NULL, NULL, &proc_page_owner_operations); - if (IS_ERR(dentry)) - return PTR_ERR(dentry); - return 0; + return PTR_ERR_OR_ZERO(dentry); } late_initcall(pageowner_init) From 48128397b04679717cfd419d55ec86456b84eb61 Mon Sep 17 00:00:00 2001 From: Jiankang Chen Date: Wed, 31 Jan 2018 16:16:52 -0800 Subject: [PATCH 045/118] mm/page_alloc.c: fix comment in __get_free_pages() __get_free_pages() will return a virtual address, but it is not just a 32-bit address, for example on a 64-bit system. And this comment really confuses new readers of mm. 
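For readers new to the API, a small usage sketch of what the corrected comment is getting at: the return value is an ordinary kernel virtual address (an unsigned long, so 64 bits wide on 64-bit systems), usable directly without kmap(), which is exactly why __GFP_HIGHMEM is rejected in this path. The function name below is illustrative.

static void get_free_pages_example(void)
{
	/* order 2: four contiguous pages, returned as a virtual address */
	unsigned long buf = __get_free_pages(GFP_KERNEL, 2);

	if (!buf)
		return;

	memset((void *)buf, 0, 4 * PAGE_SIZE);	/* no kmap() needed */
	free_pages(buf, 2);
}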
Link: http://lkml.kernel.org/r/1511780964-64864-1-git-send-email-chenjiankang1@huawei.com Signed-off-by: Jiankang Chen Reported-by: Hanjun Guo Cc: Mel Gorman Cc: Johannes Weiner Cc: Yisheng Xie Cc: Kefeng Wang Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a73cffe287a5..b411f97dfb25 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4278,7 +4278,7 @@ unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order) struct page *page; /* - * __get_free_pages() returns a 32-bit address, which cannot represent + * __get_free_pages() returns a virtual address, which cannot represent * a highmem page */ VM_BUG_ON((gfp_mask & __GFP_HIGHMEM) != 0); From e496612c5130567fc9d5f1969ca4b86665aa3cbb Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Wed, 31 Jan 2018 16:16:55 -0800 Subject: [PATCH 046/118] mm: do not stall register_shrinker() Shakeel Butt reported he has observed in production systems that the job loader gets stuck for 10s of seconds while doing a mount operation. It turns out that it was stuck in register_shrinker() because some unrelated job was under memory pressure and was spending time in shrink_slab(). Machines have a lot of shrinkers registered and jobs under memory pressure have to traverse all of those memcg-aware shrinkers and affect unrelated jobs which want to register their own shrinkers. To solve the issue, this patch simply bails out slab shrinking if it is found that someone wants to register a shrinker in parallel. A downside is it could cause unfair shrinking between shrinkers. However, it should be rare and we can add compilcated logic if we find it's not enough. [akpm@linux-foundation.org: tweak code comment] Link: http://lkml.kernel.org/r/20171115005602.GB23810@bbox Link: http://lkml.kernel.org/r/1511481899-20335-1-git-send-email-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Shakeel Butt Reported-by: Shakeel Butt Tested-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tetsuo Handa Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/mm/vmscan.c b/mm/vmscan.c index e73274a60b22..153e0795f4f0 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -489,6 +489,15 @@ static unsigned long shrink_slab(gfp_t gfp_mask, int nid, sc.nid = 0; freed += do_shrink_slab(&sc, shrinker, priority); + /* + * Bail out if someone want to register a new shrinker to + * prevent the regsitration from being stalled for long periods + * by parallel ongoing shrinking. + */ + if (rwsem_is_contended(&shrinker_rwsem)) { + freed = freed ? : 1; + break; + } } up_read(&shrinker_rwsem); From 235266b8e11c9db12497bdfc6d5e9100f3434c24 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 31 Jan 2018 16:16:59 -0800 Subject: [PATCH 047/118] selftests/vm: move 128TB mmap boundary test to generic directory Architectures like PPC64 support mmap hint address based large address space selection. This test can be run on those architectures too. Move the test from the x86 selftests to selftest/vm so that other architectures can use it too. We also add a few new test scenarios in this patch. We do test a few boundary conditions before we do a high address mmap. PPC64 uses the address limit to validate the address in the fault path. We had bugs in this area w.r.t SLB fault handling before we updated the addess limit. 
We also touch the allocated space to make sure we don't have any bugs in the fault handling path. [akpm@linux-foundation.org: restore tools/testing/selftests/vm/Makefile alpha ordering] Link: http://lkml.kernel.org/r/20171123165226.32582-1-aneesh.kumar@linux.vnet.ibm.com Signed-off-by: Aneesh Kumar K.V Cc: "Kirill A . Shutemov" Cc: Ingo Molnar Cc: Thomas Gleixner Cc: "H. Peter Anvin" Cc: Shuah Khan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/vm/Makefile | 5 +- tools/testing/selftests/vm/run_vmtests | 11 + tools/testing/selftests/vm/va_128TBswitch.c | 297 ++++++++++++++++++++ tools/testing/selftests/x86/5lvl.c | 177 ------------ 4 files changed, 311 insertions(+), 179 deletions(-) create mode 100644 tools/testing/selftests/vm/va_128TBswitch.c delete mode 100644 tools/testing/selftests/x86/5lvl.c diff --git a/tools/testing/selftests/vm/Makefile b/tools/testing/selftests/vm/Makefile index 7f45806bd863..fdefa2295ddc 100644 --- a/tools/testing/selftests/vm/Makefile +++ b/tools/testing/selftests/vm/Makefile @@ -8,17 +8,18 @@ endif CFLAGS = -Wall -I ../../../../usr/include $(EXTRA_CFLAGS) LDLIBS = -lrt TEST_GEN_FILES = compaction_test +TEST_GEN_FILES += gup_benchmark TEST_GEN_FILES += hugepage-mmap TEST_GEN_FILES += hugepage-shm TEST_GEN_FILES += map_hugetlb +TEST_GEN_FILES += mlock-random-test TEST_GEN_FILES += mlock2-tests TEST_GEN_FILES += on-fault-limit TEST_GEN_FILES += thuge-gen TEST_GEN_FILES += transhuge-stress TEST_GEN_FILES += userfaultfd -TEST_GEN_FILES += mlock-random-test +TEST_GEN_FILES += va_128TBswitch TEST_GEN_FILES += virtual_address_range -TEST_GEN_FILES += gup_benchmark TEST_PROGS := run_vmtests diff --git a/tools/testing/selftests/vm/run_vmtests b/tools/testing/selftests/vm/run_vmtests index cc826326de87..d2561895a021 100755 --- a/tools/testing/selftests/vm/run_vmtests +++ b/tools/testing/selftests/vm/run_vmtests @@ -177,4 +177,15 @@ else echo "[PASS]" fi +echo "-----------------------------" +echo "running virtual address 128TB switch test" +echo "-----------------------------" +./va_128TBswitch +if [ $? -ne 0 ]; then + echo "[FAIL]" + exitcode=1 +else + echo "[PASS]" +fi + exit $exitcode diff --git a/tools/testing/selftests/vm/va_128TBswitch.c b/tools/testing/selftests/vm/va_128TBswitch.c new file mode 100644 index 000000000000..e7fe734c374f --- /dev/null +++ b/tools/testing/selftests/vm/va_128TBswitch.c @@ -0,0 +1,297 @@ +/* + * + * Authors: Kirill A. Shutemov + * Authors: Aneesh Kumar K.V + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License, version 2, as + * published by the Free Software Foundation. + + * This program is distributed in the hope that it would be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + * + */ + +#include +#include +#include + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) + +#ifdef __powerpc64__ +#define PAGE_SIZE (64 << 10) +/* + * This will work with 16M and 2M hugepage size + */ +#define HUGETLB_SIZE (16 << 20) +#else +#define PAGE_SIZE (4 << 10) +#define HUGETLB_SIZE (2 << 20) +#endif + +/* + * >= 128TB is the hint addr value we used to select + * large address space. 
+ */ +#define ADDR_SWITCH_HINT (1UL << 47) +#define LOW_ADDR ((void *) (1UL << 30)) +#define HIGH_ADDR ((void *) (1UL << 48)) + +struct testcase { + void *addr; + unsigned long size; + unsigned long flags; + const char *msg; + unsigned int low_addr_required:1; + unsigned int keep_mapped:1; +}; + +static struct testcase testcases[] = { + { + /* + * If stack is moved, we could possibly allocate + * this at the requested address. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + /* + * We should never allocate at the requested address or above it + * The len cross the 128TB boundary. Without MAP_FIXED + * we will always search in the lower address space. + */ + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, (2 * PAGE_SIZE))", + .low_addr_required = 1, + }, + { + /* + * Exact mapping at 128TB, the area is free we should get that + * even without MAP_FIXED. + */ + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, + { + .addr = NULL, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED)", + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1) again", + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT - PAGE_SIZE)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, PAGE_SIZE)", + .low_addr_required = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE / 2), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE/2 , 2 * PAGE_SIZE)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = ((void *)(ADDR_SWITCH_HINT)), + .size = PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT, PAGE_SIZE)", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * PAGE_SIZE, + .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT, 2 * PAGE_SIZE, MAP_FIXED)", + }, +}; + +static struct testcase 
hugetlb_testcases[] = { + { + .addr = NULL, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(NULL, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = LOW_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", + .low_addr_required = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", + .keep_mapped = 1, + }, + { + .addr = HIGH_ADDR, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB)", + .keep_mapped = 1, + }, + { + .addr = (void *) -1, + .size = HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(-1, MAP_HUGETLB) again", + }, + { + .addr = (void *)(ADDR_SWITCH_HINT - PAGE_SIZE), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, + .msg = "mmap(ADDR_SWITCH_HINT - PAGE_SIZE, 2*HUGETLB_SIZE, MAP_HUGETLB)", + .low_addr_required = 1, + .keep_mapped = 1, + }, + { + .addr = (void *)(ADDR_SWITCH_HINT), + .size = 2 * HUGETLB_SIZE, + .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, + .msg = "mmap(ADDR_SWITCH_HINT , 2*HUGETLB_SIZE, MAP_FIXED | MAP_HUGETLB)", + }, +}; + +static int run_test(struct testcase *test, int count) +{ + void *p; + int i, ret = 0; + + for (i = 0; i < count; i++) { + struct testcase *t = test + i; + + p = mmap(t->addr, t->size, PROT_READ | PROT_WRITE, t->flags, -1, 0); + + printf("%s: %p - ", t->msg, p); + + if (p == MAP_FAILED) { + printf("FAILED\n"); + ret = 1; + continue; + } + + if (t->low_addr_required && p >= (void *)(ADDR_SWITCH_HINT)) { + printf("FAILED\n"); + ret = 1; + } else { + /* + * Do a dereference of the address returned so that we catch + * bugs in page fault handling + */ + memset(p, 0, t->size); + printf("OK\n"); + } + if (!t->keep_mapped) + munmap(p, t->size); + } + + return ret; +} + +static int supported_arch(void) +{ +#if defined(__powerpc64__) + return 1; +#elif defined(__x86_64__) + return 1; +#else + return 0; +#endif +} + +int main(int argc, char **argv) +{ + int ret; + + if (!supported_arch()) + return 0; + + ret = run_test(testcases, ARRAY_SIZE(testcases)); + if (argc == 2 && !strcmp(argv[1], "--run-hugetlb")) + ret = run_test(hugetlb_testcases, ARRAY_SIZE(hugetlb_testcases)); + return ret; +} diff --git a/tools/testing/selftests/x86/5lvl.c b/tools/testing/selftests/x86/5lvl.c deleted file mode 100644 index 2eafdcd4c2b3..000000000000 --- a/tools/testing/selftests/x86/5lvl.c +++ /dev/null @@ -1,177 +0,0 @@ -#include -#include - -#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof((arr)[0])) - -#define PAGE_SIZE 4096 -#define LOW_ADDR ((void *) (1UL << 30)) -#define HIGH_ADDR ((void *) (1UL << 50)) - -struct testcase { - void *addr; - unsigned long size; - unsigned long flags; - const char *msg; - unsigned int low_addr_required:1; - unsigned int keep_mapped:1; -}; - -static struct testcase testcases[] = { - { - .addr = NULL, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL)", - .low_addr_required = 1, - }, - { - .addr = 
LOW_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED)", - }, - { - .addr = (void*) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1)", - .keep_mapped = 1, - }, - { - .addr = (void*) -1, - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1) again", - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 2 * PAGE_SIZE)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE / 2), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 2 * PAGE_SIZE / 2)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 2 * PAGE_SIZE, - .flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap((1UL << 47) - PAGE_SIZE, 2 * PAGE_SIZE, MAP_FIXED)", - }, - { - .addr = NULL, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(NULL, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = LOW_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(LOW_ADDR, MAP_HUGETLB)", - .low_addr_required = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(HIGH_ADDR, MAP_HUGETLB) again", - .keep_mapped = 1, - }, - { - .addr = HIGH_ADDR, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap(HIGH_ADDR, MAP_FIXED | MAP_HUGETLB)", - }, - { - .addr = (void*) -1, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB)", - .keep_mapped = 1, - }, - { - .addr = (void*) -1, - .size = 2UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap(-1, MAP_HUGETLB) again", - }, - { - .addr = (void *)((1UL << 47) - PAGE_SIZE), - .size = 4UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS, - .msg = "mmap((1UL << 47), 4UL << 20, MAP_HUGETLB)", - .low_addr_required = 1, - .keep_mapped = 1, - }, - { - .addr = (void *)((1UL << 47) - (2UL << 20)), - .size = 4UL << 20, - .flags = MAP_HUGETLB | MAP_PRIVATE | MAP_ANONYMOUS | MAP_FIXED, - .msg = "mmap((1UL << 47) - (2UL << 20), 4UL << 20, MAP_FIXED | MAP_HUGETLB)", - }, -}; - -int main(int argc, char **argv) -{ - int i; - void *p; - - for (i = 0; i < ARRAY_SIZE(testcases); i++) { - struct testcase *t = testcases + i; - - p = mmap(t->addr, t->size, PROT_NONE, t->flags, -1, 0); - - printf("%s: %p - ", t->msg, p); - - if (p == MAP_FAILED) { - printf("FAILED\n"); - continue; - } - - if (t->low_addr_required && p >= (void *)(1UL << 47)) - printf("FAILED\n"); - else - printf("OK\n"); - if (!t->keep_mapped) - munmap(p, t->size); - } - return 0; -} From 
e025f059a32085d76768e46eac344cba203a6a71 Mon Sep 17 00:00:00 2001 From: Vasyl Gomonovych Date: Wed, 31 Jan 2018 16:17:03 -0800 Subject: [PATCH 048/118] mm/interval_tree.c: use vma_pages() helper Use vma_pages function on vma object instead of explicit computation. mm/interval_tree.c:21:27-33: WARNING: Consider using vma_pages helper Generated by: scripts/coccinelle/api/vma_pages.cocci Link: http://lkml.kernel.org/r/1511364410-13499-1-git-send-email-gomonovych@gmail.com Signed-off-by: Vasyl Gomonovych Acked-by: Michael S. Tsirkin Acked-by: Davidlohr Bueso Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/interval_tree.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/interval_tree.c b/mm/interval_tree.c index b47664358796..27ddfd29112a 100644 --- a/mm/interval_tree.c +++ b/mm/interval_tree.c @@ -18,7 +18,7 @@ static inline unsigned long vma_start_pgoff(struct vm_area_struct *v) static inline unsigned long vma_last_pgoff(struct vm_area_struct *v) { - return v->vm_pgoff + ((v->vm_end - v->vm_start) >> PAGE_SHIFT) - 1; + return v->vm_pgoff + vma_pages(v) - 1; } INTERVAL_TREE_DEFINE(struct vm_area_struct, shared.rb, From a4ef87684108e5fef38cf289ee360f9b87a53cfd Mon Sep 17 00:00:00 2001 From: Jan Kara Date: Wed, 31 Jan 2018 16:17:06 -0800 Subject: [PATCH 049/118] mm: remove unused pgdat_reclaimable_pages() Remove unused function pgdat_reclaimable_pages() and node_page_state_snapshot() which becomes unused as well. Link: http://lkml.kernel.org/r/20171122094416.26019-1-jack@suse.cz Signed-off-by: Jan Kara Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/swap.h | 1 - include/linux/vmstat.h | 17 ----------------- mm/vmscan.c | 16 ---------------- 3 files changed, 34 deletions(-) diff --git a/include/linux/swap.h b/include/linux/swap.h index 0bd4c25016f9..7b6a59f722a3 100644 --- a/include/linux/swap.h +++ b/include/linux/swap.h @@ -344,7 +344,6 @@ extern void lru_cache_add_active_or_unevictable(struct page *page, /* linux/mm/vmscan.c */ extern unsigned long zone_reclaimable_pages(struct zone *zone); -extern unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat); extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, gfp_t gfp_mask, nodemask_t *mask); extern int __isolate_lru_page(struct page *page, isolate_mode_t mode); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 1779c9817b39..a4c2317d8b9f 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -216,23 +216,6 @@ static inline unsigned long zone_page_state_snapshot(struct zone *zone, return x; } -static inline unsigned long node_page_state_snapshot(pg_data_t *pgdat, - enum node_stat_item item) -{ - long x = atomic_long_read(&pgdat->vm_stat[item]); - -#ifdef CONFIG_SMP - int cpu; - for_each_online_cpu(cpu) - x += per_cpu_ptr(pgdat->per_cpu_nodestats, cpu)->vm_node_stat_diff[item]; - - if (x < 0) - x = 0; -#endif - return x; -} - - #ifdef CONFIG_NUMA extern void __inc_numa_state(struct zone *zone, enum numa_stat_item item); extern unsigned long sum_zone_node_page_state(int node, diff --git a/mm/vmscan.c b/mm/vmscan.c index 153e0795f4f0..1a33c8e1e758 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -220,22 +220,6 @@ unsigned long zone_reclaimable_pages(struct zone *zone) return nr; } -unsigned long pgdat_reclaimable_pages(struct pglist_data *pgdat) -{ - unsigned long nr; - - nr = node_page_state_snapshot(pgdat, NR_ACTIVE_FILE) + - node_page_state_snapshot(pgdat, NR_INACTIVE_FILE) + - 
node_page_state_snapshot(pgdat, NR_ISOLATED_FILE); - - if (get_nr_swap_pages() > 0) - nr += node_page_state_snapshot(pgdat, NR_ACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_INACTIVE_ANON) + - node_page_state_snapshot(pgdat, NR_ISOLATED_ANON); - - return nr; -} - /** * lruvec_lru_size - Returns the number of pages on the given LRU list. * @lruvec: lru vector From d6cb41cc44c63492702281b1d329955ca767d399 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:17:10 -0800 Subject: [PATCH 050/118] mm, hugetlb: remove hugepages_treat_as_movable sysctl hugepages_treat_as_movable has been introduced by 396faf0303d2 ("Allow huge page allocations to use GFP_HIGH_MOVABLE") to allow hugetlb allocations from ZONE_MOVABLE even when hugetlb pages were not migrateable. The purpose of the movable zone was different at the time. It aimed at reducing memory fragmentation and hugetlb pages being long lived and large werre not contributing to the fragmentation so it was acceptable to use the zone back then. Things have changed though and the primary purpose of the zone became migratability guarantee. If we allow non migrateable hugetlb pages to be in ZONE_MOVABLE memory hotplug might fail to offline the memory. Remove the knob and only rely on hugepage_migration_supported to allow movable zones. Mel said: : Primarily it was aimed at allowing the hugetlb pool to safely shrink with : the ability to grow it again. The use case was for batched jobs, some of : which needed huge pages and others that did not but didn't want the memory : useless pinned in the huge pages pool. : : I suspect that more users rely on THP than hugetlbfs for flexible use of : huge pages with fallback options so I think that removing the option : should be ok. Link: http://lkml.kernel.org/r/20171003072619.8654-1-mhocko@kernel.org Signed-off-by: Michal Hocko Reported-by: Alexandru Moise <00moses.alexander00@gmail.com> Acked-by: Mel Gorman Cc: Alexandru Moise <00moses.alexander00@gmail.com> Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/sysctl/vm.txt | 25 ------------------------- include/linux/hugetlb.h | 1 - kernel/sysctl.c | 7 ------- mm/hugetlb.c | 4 +--- 4 files changed, 1 insertion(+), 36 deletions(-) diff --git a/Documentation/sysctl/vm.txt b/Documentation/sysctl/vm.txt index 5025ff9307e6..ff234d229cbb 100644 --- a/Documentation/sysctl/vm.txt +++ b/Documentation/sysctl/vm.txt @@ -30,7 +30,6 @@ Currently, these files are in /proc/sys/vm: - dirty_writeback_centisecs - drop_caches - extfrag_threshold -- hugepages_treat_as_movable - hugetlb_shm_group - laptop_mode - legacy_va_layout @@ -261,30 +260,6 @@ any throttling. ============================================================== -hugepages_treat_as_movable - -This parameter controls whether we can allocate hugepages from ZONE_MOVABLE -or not. If set to non-zero, hugepages can be allocated from ZONE_MOVABLE. -ZONE_MOVABLE is created when kernel boot parameter kernelcore= is specified, -so this parameter has no effect if used without kernelcore=. - -Hugepage migration is now available in some situations which depend on the -architecture and/or the hugepage size. If a hugepage supports migration, -allocation from ZONE_MOVABLE is always enabled for the hugepage regardless -of the value of this parameter. -IOW, this parameter affects only non-migratable hugepages. 
- -Assuming that hugepages are not migratable in your system, one usecase of -this parameter is that users can make hugepage pool more extensible by -enabling the allocation from ZONE_MOVABLE. This is because on ZONE_MOVABLE -page reclaim/migration/compaction work more and you can get contiguous -memory more likely. Note that using ZONE_MOVABLE for non-migratable -hugepages can do harm to other features like memory hotremove (because -memory hotremove expects that memory blocks on ZONE_MOVABLE are always -removable,) so it's a trade-off responsible for the users. - -============================================================== - hugetlb_shm_group hugetlb_shm_group contains group id that is allowed to create SysV diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 82a25880714a..6fcf140188d0 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -129,7 +129,6 @@ u32 hugetlb_fault_mutex_hash(struct hstate *h, struct mm_struct *mm, pte_t *huge_pmd_share(struct mm_struct *mm, unsigned long addr, pud_t *pud); -extern int hugepages_treat_as_movable; extern int sysctl_hugetlb_shm_group; extern struct list_head huge_boot_pages; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 557d46728577..2fb4e27c636a 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1374,13 +1374,6 @@ static struct ctl_table vm_table[] = { .mode = 0644, .proc_handler = proc_dointvec, }, - { - .procname = "hugepages_treat_as_movable", - .data = &hugepages_treat_as_movable, - .maxlen = sizeof(int), - .mode = 0644, - .proc_handler = proc_dointvec, - }, { .procname = "nr_overcommit_hugepages", .data = NULL, diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 1e6a5ad0d420..4137fb67cd79 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -36,8 +36,6 @@ #include #include "internal.h" -int hugepages_treat_as_movable; - int hugetlb_max_hstate __read_mostly; unsigned int default_hstate_idx; struct hstate hstates[HUGE_MAX_HSTATE]; @@ -926,7 +924,7 @@ retry_cpuset: /* Movability of hugepages depends on migration support. */ static inline gfp_t htlb_alloc_mask(struct hstate *h) { - if (hugepages_treat_as_movable || hugepage_migration_supported(h)) + if (hugepage_migration_supported(h)) return GFP_HIGHUSER_MOVABLE; else return GFP_HIGHUSER; From dc88c88904b8c5eb749874aecc278146b6ae02f3 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:17:14 -0800 Subject: [PATCH 051/118] mm/memory_hotplug.c: remove unnecesary check from register_page_bootmem_info_section() When we call register_page_bootmem_info_section() having CONFIG_SPARSEMEM_VMEMMAP enabled, we check if the pfn is valid. This check is redundant as we already checked this in register_page_bootmem_info_node() before calling register_page_bootmem_info_section(), so let's get rid of it. 
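For context, a trimmed paraphrase (not verbatim kernel code) of the caller shows why the callee's check cannot fire: register_page_bootmem_info_node() only passes section-aligned pfns that it has already validated.

	/* Paraphrased from register_page_bootmem_info_node(), trimmed for illustration */
	for (pfn = pgdat->node_start_pfn; pfn < end_pfn; pfn += PAGES_PER_SECTION) {
		if (!pfn_valid(pfn))	/* invalid pfns are filtered here ... */
			continue;
		/* ... so the callee never sees one */
		register_page_bootmem_info_section(pfn);
	}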
Link: http://lkml.kernel.org/r/20171205143422.GA31458@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 3 --- 1 file changed, 3 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 999ce3af809d..9646e5d63648 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -200,9 +200,6 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) struct mem_section *ms; struct page *page, *memmap; - if (!pfn_valid(start_pfn)) - return; - section_nr = pfn_to_section_nr(start_pfn); ms = __nr_to_section(section_nr); From ef549e13cf62733097eb1f7a9f44b2cea1611007 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Wed, 31 Jan 2018 16:17:17 -0800 Subject: [PATCH 052/118] mm: update comment describing tlb_gather_mmu The comment describes @fullmm argument, but the function has no such parameter. Update the comment to match the code and convert it to kernel-doc markup. Link: http://lkml.kernel.org/r/1512394531-2264-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 793004608332..82a0577933aa 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -400,10 +400,17 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table) #endif /* CONFIG_HAVE_RCU_TABLE_FREE */ -/* tlb_gather_mmu - * Called to initialize an (on-stack) mmu_gather structure for page-table - * tear-down from @mm. The @fullmm argument is used when @mm is without - * users and we're going to destroy the full address space (exit/execve). +/** + * tlb_gather_mmu - initialize an mmu_gather structure for page-table tear-down + * @tlb: the mmu_gather structure to initialize + * @mm: the mm_struct of the target address space + * @start: start of the region that will be removed from the page-table + * @end: end of the region that will be removed from the page-table + * + * Called to initialize an (on-stack) mmu_gather structure for page-table + * tear-down from @mm. The @start and @end are set to 0 and -1 + * respectively when @mm is without users and we're going to destroy + * the full address space (exit/execve). */ void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm, unsigned long start, unsigned long end) From 8526d84f81710c77ead9a7bfe82b66a241f1aed1 Mon Sep 17 00:00:00 2001 From: Konstantin Khlebnikov Date: Wed, 31 Jan 2018 16:17:22 -0800 Subject: [PATCH 053/118] fs/proc/task_mmu.c: do not show VmExe bigger than total executable virtual memory If start_code / end_code pointers are screwed then "VmExe" could be bigger than total executable virtual memory and "VmLib" becomes negative: VmExe: 294320 kB VmLib: 18446744073709327564 kB VmExe and VmLib documented as text segment and shared library code size. Now their sum will be always equal to mm->exec_vm which sums size of executable and not writable and not stack areas. I've seen this for huge (>2Gb) statically linked binary which has whole world inside. For it start_code .. end_code range also covers one of rodata sections. Probably this is bug in customized linker, elf loader or both. Anyway CONFIG_CHECKPOINT_RESTORE allows to change these pointers, thus we cannot trust them without validation. 
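To make the clamp concrete, here is a small standalone example with made-up numbers (not taken from the report above):

	/* Hypothetical numbers; demonstrates the min() clamp added to task_mem(). */
	#include <stdio.h>

	int main(void)
	{
		unsigned long exec_bytes = 100UL << 20;	/* mm->exec_vm << PAGE_SHIFT */
		unsigned long text = 150UL << 20;	/* bogus end_code - start_code span */

		unsigned long old_lib = exec_bytes - text;	/* wraps around */
		unsigned long new_text = text < exec_bytes ? text : exec_bytes;
		unsigned long new_lib = exec_bytes - new_text;	/* clamped to 0 */

		printf("old VmLib: %lu kB, new VmLib: %lu kB\n",
		       old_lib >> 10, new_lib >> 10);
		return 0;
	}

With the clamp, VmExe can never report more than mm->exec_vm, and VmLib can no longer go "negative".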
Link: http://lkml.kernel.org/r/150728955451.743749.11276392315459539583.stgit@buzz Signed-off-by: Konstantin Khlebnikov Acked-by: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 339e4c1c044d..4691f5aca00e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -47,8 +47,11 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) if (hiwater_rss < mm->hiwater_rss) hiwater_rss = mm->hiwater_rss; - text = (PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK)) >> 10; - lib = (mm->exec_vm << (PAGE_SHIFT-10)) - text; + /* split executable areas between text and lib */ + text = PAGE_ALIGN(mm->end_code) - (mm->start_code & PAGE_MASK); + text = min(text, mm->exec_vm << PAGE_SHIFT); + lib = (mm->exec_vm << PAGE_SHIFT) - text; + swap = get_mm_counter(mm, MM_SWAPENTS); seq_printf(m, "VmPeak:\t%8lu kB\n" @@ -76,7 +79,9 @@ void task_mem(struct seq_file *m, struct mm_struct *mm) file << (PAGE_SHIFT-10), shmem << (PAGE_SHIFT-10), mm->data_vm << (PAGE_SHIFT-10), - mm->stack_vm << (PAGE_SHIFT-10), text, lib, + mm->stack_vm << (PAGE_SHIFT-10), + text >> 10, + lib >> 10, mm_pgtables_bytes(mm) >> 10, swap << (PAGE_SHIFT-10)); hugetlb_report_usage(m, mm); From 9ac9322d7cfa35b5381a08c7eaed56eb2297377e Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:17:25 -0800 Subject: [PATCH 054/118] mm: memory_hotplug: remove second __nr_to_section in register_page_bootmem_info_section() In register_page_bootmem_info_section() we call __nr_to_section() in order to get the mem_section struct at the beginning of the function. Since we already got it, there is no need for a second call to __nr_to_section(). Link: http://lkml.kernel.org/r/20171207102914.GA12396@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory_hotplug.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 9646e5d63648..9bbd6982d4e4 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -184,7 +184,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) for (i = 0; i < mapsize; i++, page++) get_page_bootmem(section_nr, page, SECTION_INFO); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; @@ -207,7 +207,7 @@ static void register_page_bootmem_info_section(unsigned long start_pfn) register_page_bootmem_memmap(section_nr, memmap, PAGES_PER_SECTION); - usemap = __nr_to_section(section_nr)->pageblock_flags; + usemap = ms->pageblock_flags; page = virt_to_page(usemap); mapsize = PAGE_ALIGN(usemap_size()) >> PAGE_SHIFT; From 9bebc09fcf4fb25e36cf86af764c038b92f64057 Mon Sep 17 00:00:00 2001 From: Yisheng Xie Date: Wed, 31 Jan 2018 16:17:29 -0800 Subject: [PATCH 055/118] mm/huge_memory.c: fix comment in __split_huge_pmd_locked pmd_trans_splitting() was removed after THP refcounting redesign, therefore related comment should be updated. Link: http://lkml.kernel.org/r/1512625745-59451-1-git-send-email-xieyisheng1@huawei.com Signed-off-by: Yisheng Xie Acked-by: Kirill A. 
Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0e7ded98d114..0d3ae51ce4f7 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2205,10 +2205,9 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * for the same virtual address to be loaded simultaneously. So instead * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the * current pmd notpresent (atomically because here the pmd_trans_huge - * and pmd_trans_splitting must remain set at all times on the pmd - * until the split is complete for this pmd), then we flush the SMP TLB - * and finally we write the non-huge version of the pmd entry with - * pmd_populate. + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. */ pmdp_invalidate(vma, haddr, pmd); pmd_populate(mm, pmd, pgtable); From a365ac09d334389bc69841c9d153f03fa2442f1c Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Wed, 31 Jan 2018 16:17:32 -0800 Subject: [PATCH 056/118] mm, userfaultfd, THP: avoid waiting when PMD under THP migration If THP migration is enabled, for a VMA handled by userfaultfd, consider the following situation, do_page_fault() __do_huge_pmd_anonymous_page() handle_userfault() userfault_msg() /* a huge page is allocated and mapped at fault address */ /* the huge page is under migration, leaves migration entry in page table */ userfaultfd_must_wait() /* return true because !pmd_present() */ /* may wait in loop until fatal signal */ That is, it may be possible for userfaultfd_must_wait() encounters a PMD entry which is !pmd_none() && !pmd_present(). In the current implementation, we will wait for such PMD entries, which may cause unnecessary waiting, and potential soft lockup. This is fixed via avoiding to wait when !pmd_none() && !pmd_present(), only wait when pmd_none(). This may be not a problem in practice, because userfaultfd_must_wait() is always called with mm->mmap_sem read-locked. mremap() will write-lock mm->mmap_sem. And UFFDIO_COPY doesn't support to copy THP mapping. But the change introduced still makes the code more correct, and makes the PMD and PTE code more consistent. Link: http://lkml.kernel.org/r/20171207011752.3292-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Andrea Arcangeli Cc: Mike Kravetz Cc: Mike Rapoport Cc: "Kirill A. Shutemov" Cc: Alexander Viro Cc: Zi Yan Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 743eaa646898..a9d0ddc12ace 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -294,10 +294,13 @@ static inline bool userfaultfd_must_wait(struct userfaultfd_ctx *ctx, * pmd_trans_unstable) of the pmd. */ _pmd = READ_ONCE(*pmd); - if (!pmd_present(_pmd)) + if (pmd_none(_pmd)) goto out; ret = false; + if (!pmd_present(_pmd)) + goto out; + if (pmd_trans_huge(_pmd)) goto out; From 977fbdcd5986c9ff700bf276644d2b1973a53348 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:17:36 -0800 Subject: [PATCH 057/118] mm: add unmap_mapping_pages() Several users of unmap_mapping_range() would prefer to express their range in pages rather than bytes. 
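For illustration, the two call shapes side by side (a sketch only; the actual conversions are in the diff below, and 'index' stands for a pgoff_t page offset into the file):

	/* byte interface: note the (loff_t) widening the caller must remember */
	unmap_mapping_range(mapping, (loff_t)index << PAGE_SHIFT, PAGE_SIZE, 0);

	/* page interface: nothing to shift, nothing to widen */
	unmap_mapping_pages(mapping, index, 1, false);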
Unfortuately, on a 32-bit kernel, you have to remember to cast your page number to a 64-bit type before shifting it, and four places in the current tree didn't remember to do that. That's a sign of a bad interface. Conveniently, unmap_mapping_range() actually converts from bytes into pages, so hoist the guts of unmap_mapping_range() into a new function unmap_mapping_pages() and convert the callers which want to use pages. Link: http://lkml.kernel.org/r/20171206142627.GD32044@bombadil.infradead.org Signed-off-by: Matthew Wilcox Reported-by: "zhangyi (F)" Reviewed-by: Ross Zwisler Acked-by: Kirill A. Shutemov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 19 ++++++------------- include/linux/mm.h | 26 ++++++++++++++++---------- mm/khugepaged.c | 3 +-- mm/memory.c | 43 +++++++++++++++++++++++++++++++------------ mm/nommu.c | 7 ------- mm/truncate.c | 23 +++++++---------------- 6 files changed, 61 insertions(+), 60 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index c2ebf10b70da..6ee6f7e24f5a 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -44,6 +44,7 @@ /* The 'colour' (ie low bits) within a PMD of a page offset. */ #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) +#define PG_PMD_NR (PMD_SIZE >> PAGE_SHIFT) static wait_queue_head_t wait_table[DAX_WAIT_TABLE_ENTRIES]; @@ -375,8 +376,8 @@ restart: * unmapped. */ if (pmd_downgrade && dax_is_zero_entry(entry)) - unmap_mapping_range(mapping, - (index << PAGE_SHIFT) & PMD_MASK, PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); err = radix_tree_preload( mapping_gfp_mask(mapping) & ~__GFP_HIGHMEM); @@ -538,12 +539,10 @@ static void *dax_insert_mapping_entry(struct address_space *mapping, if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { /* we are replacing a zero page with block mapping */ if (dax_is_pmd_entry(entry)) - unmap_mapping_range(mapping, - (vmf->pgoff << PAGE_SHIFT) & PMD_MASK, - PMD_SIZE, 0); + unmap_mapping_pages(mapping, index & ~PG_PMD_COLOUR, + PG_PMD_NR, false); else /* pte entry */ - unmap_mapping_range(mapping, vmf->pgoff << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, vmf->pgoff, 1, false); } spin_lock_irq(&mapping->tree_lock); @@ -1269,12 +1268,6 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } #ifdef CONFIG_FS_DAX_PMD -/* - * The 'colour' (ie low bits) within a PMD of a page offset. This comes up - * more often than one might expect in the below functions. 
- */ -#define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1) - static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { diff --git a/include/linux/mm.h b/include/linux/mm.h index 7fc92384977e..173d2484f6e3 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1312,8 +1312,6 @@ void free_pgd_range(struct mmu_gather *tlb, unsigned long addr, unsigned long end, unsigned long floor, unsigned long ceiling); int copy_page_range(struct mm_struct *dst, struct mm_struct *src, struct vm_area_struct *vma); -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, int even_cows); int follow_pte_pmd(struct mm_struct *mm, unsigned long address, unsigned long *start, unsigned long *end, pte_t **ptepp, pmd_t **pmdpp, spinlock_t **ptlp); @@ -1324,12 +1322,6 @@ int follow_phys(struct vm_area_struct *vma, unsigned long address, int generic_access_phys(struct vm_area_struct *vma, unsigned long addr, void *buf, int len, int write); -static inline void unmap_shared_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen) -{ - unmap_mapping_range(mapping, holebegin, holelen, 0); -} - extern void truncate_pagecache(struct inode *inode, loff_t new); extern void truncate_setsize(struct inode *inode, loff_t newsize); void pagecache_isize_extended(struct inode *inode, loff_t from, loff_t to); @@ -1344,6 +1336,10 @@ extern int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, extern int fixup_user_fault(struct task_struct *tsk, struct mm_struct *mm, unsigned long address, unsigned int fault_flags, bool *unlocked); +void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows); +void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows); #else static inline int handle_mm_fault(struct vm_area_struct *vma, unsigned long address, unsigned int flags) @@ -1360,10 +1356,20 @@ static inline int fixup_user_fault(struct task_struct *tsk, BUG(); return -EFAULT; } +static inline void unmap_mapping_pages(struct address_space *mapping, + pgoff_t start, pgoff_t nr, bool even_cows) { } +static inline void unmap_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen, int even_cows) { } #endif -extern int access_process_vm(struct task_struct *tsk, unsigned long addr, void *buf, int len, - unsigned int gup_flags); +static inline void unmap_shared_mapping_range(struct address_space *mapping, + loff_t const holebegin, loff_t const holelen) +{ + unmap_mapping_range(mapping, holebegin, holelen, 0); +} + +extern int access_process_vm(struct task_struct *tsk, unsigned long addr, + void *buf, int len, unsigned int gup_flags); extern int access_remote_vm(struct mm_struct *mm, unsigned long addr, void *buf, int len, unsigned int gup_flags); extern int __access_remote_vm(struct task_struct *tsk, struct mm_struct *mm, diff --git a/mm/khugepaged.c b/mm/khugepaged.c index ea4ff259b671..1cd18e4347fe 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1399,8 +1399,7 @@ static void collapse_shmem(struct mm_struct *mm, } if (page_mapped(page)) - unmap_mapping_range(mapping, index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, 1, false); spin_lock_irq(&mapping->tree_lock); diff --git a/mm/memory.c b/mm/memory.c index 82a0577933aa..a6e5d6ac5d24 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -2798,9 +2798,38 @@ static inline void unmap_mapping_range_tree(struct 
rb_root_cached *root, } } +/** + * unmap_mapping_pages() - Unmap pages from processes. + * @mapping: The address space containing pages to be unmapped. + * @start: Index of first page to be unmapped. + * @nr: Number of pages to be unmapped. 0 to unmap to end of file. + * @even_cows: Whether to unmap even private COWed pages. + * + * Unmap the pages in this address space from any userspace process which + * has them mmaped. Generally, you want to remove COWed pages as well when + * a file is being truncated, but not when invalidating pages from the page + * cache. + */ +void unmap_mapping_pages(struct address_space *mapping, pgoff_t start, + pgoff_t nr, bool even_cows) +{ + struct zap_details details = { }; + + details.check_mapping = even_cows ? NULL : mapping; + details.first_index = start; + details.last_index = start + nr - 1; + if (details.last_index < details.first_index) + details.last_index = ULONG_MAX; + + i_mmap_lock_write(mapping); + if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) + unmap_mapping_range_tree(&mapping->i_mmap, &details); + i_mmap_unlock_write(mapping); +} + /** * unmap_mapping_range - unmap the portion of all mmaps in the specified - * address_space corresponding to the specified page range in the underlying + * address_space corresponding to the specified byte range in the underlying * file. * * @mapping: the address space containing mmaps to be unmapped. @@ -2818,7 +2847,6 @@ static inline void unmap_mapping_range_tree(struct rb_root_cached *root, void unmap_mapping_range(struct address_space *mapping, loff_t const holebegin, loff_t const holelen, int even_cows) { - struct zap_details details = { }; pgoff_t hba = holebegin >> PAGE_SHIFT; pgoff_t hlen = (holelen + PAGE_SIZE - 1) >> PAGE_SHIFT; @@ -2830,16 +2858,7 @@ void unmap_mapping_range(struct address_space *mapping, hlen = ULONG_MAX - hba + 1; } - details.check_mapping = even_cows ? NULL : mapping; - details.first_index = hba; - details.last_index = hba + hlen - 1; - if (details.last_index < details.first_index) - details.last_index = ULONG_MAX; - - i_mmap_lock_write(mapping); - if (unlikely(!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root))) - unmap_mapping_range_tree(&mapping->i_mmap, &details); - i_mmap_unlock_write(mapping); + unmap_mapping_pages(mapping, hba, hlen, even_cows); } EXPORT_SYMBOL(unmap_mapping_range); diff --git a/mm/nommu.c b/mm/nommu.c index 17c00d93de2e..4b9864b17cb0 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1788,13 +1788,6 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -void unmap_mapping_range(struct address_space *mapping, - loff_t const holebegin, loff_t const holelen, - int even_cows) -{ -} -EXPORT_SYMBOL(unmap_mapping_range); - int filemap_fault(struct vm_fault *vmf) { BUG(); diff --git a/mm/truncate.c b/mm/truncate.c index e4b4cf0f4070..c34e2fd4f583 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -179,12 +179,8 @@ static void truncate_cleanup_page(struct address_space *mapping, struct page *page) { if (page_mapped(page)) { - loff_t holelen; - - holelen = PageTransHuge(page) ? HPAGE_PMD_SIZE : PAGE_SIZE; - unmap_mapping_range(mapping, - (loff_t)page->index << PAGE_SHIFT, - holelen, 0); + pgoff_t nr = PageTransHuge(page) ? HPAGE_PMD_NR : 1; + unmap_mapping_pages(mapping, page->index, nr, false); } if (page_has_private(page)) @@ -715,19 +711,15 @@ int invalidate_inode_pages2_range(struct address_space *mapping, /* * Zap the rest of the file in one hit. 
*/ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - (loff_t)(1 + end - index) - << PAGE_SHIFT, - 0); + unmap_mapping_pages(mapping, index, + (1 + end - index), false); did_range_unmap = 1; } else { /* * Just zap this page */ - unmap_mapping_range(mapping, - (loff_t)index << PAGE_SHIFT, - PAGE_SIZE, 0); + unmap_mapping_pages(mapping, index, + 1, false); } } BUG_ON(page_mapped(page)); @@ -753,8 +745,7 @@ int invalidate_inode_pages2_range(struct address_space *mapping, * get remapped later. */ if (dax_mapping(mapping)) { - unmap_mapping_range(mapping, (loff_t)start << PAGE_SHIFT, - (loff_t)(end - start + 1) << PAGE_SHIFT, 0); + unmap_mapping_pages(mapping, start, end - start + 1, false); } out: cleancache_invalidate_inode(mapping); From 146500e9604cece72d4bed1cd15fac789220c795 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:17:40 -0800 Subject: [PATCH 058/118] mm: get 7% more pages in a pagevec We don't have to use an entire 'long' for the number of elements in the pagevec; we know it's a number between 0 and 14 (now 15). So we can store it in a char, and then the bool packs next to it and we still have two or six bytes of padding for more elements in the header. That gives us space to cram in an extra page. Link: http://lkml.kernel.org/r/20171206022521.GM26021@bombadil.infradead.org Signed-off-by: Matthew Wilcox Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagevec.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/pagevec.h b/include/linux/pagevec.h index 5fb6580f7f23..6dc456ac6136 100644 --- a/include/linux/pagevec.h +++ b/include/linux/pagevec.h @@ -9,14 +9,14 @@ #ifndef _LINUX_PAGEVEC_H #define _LINUX_PAGEVEC_H -/* 14 pointers + two long's align the pagevec structure to a power of two */ -#define PAGEVEC_SIZE 14 +/* 15 pointers + header align the pagevec structure to a power of two */ +#define PAGEVEC_SIZE 15 struct page; struct address_space; struct pagevec { - unsigned long nr; + unsigned char nr; bool percpu_pvec_drained; struct page *pages[PAGEVEC_SIZE]; }; From c58f0bb77ed8bf93dfdde762b01cb67eebbdfc29 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:17:43 -0800 Subject: [PATCH 059/118] asm-generic: provide generic_pmdp_establish() Patch series "Do not lose dirty bit on THP pages", v4. Vlastimil noted that pmdp_invalidate() is not atomic and we can lose dirty and access bits if CPU sets them after pmdp dereference, but before set_pmd_at(). The bug can lead to data loss, but the race window is tiny and I haven't seen any reports that suggested that it happens in reality. So I don't think it worth sending it to stable. Unfortunately, there's no way to address the issue in a generic way. We need to fix all architectures that support THP one-by-one. All architectures that have THP supported have to provide atomic pmdp_invalidate() that returns previous value. If generic implementation of pmdp_invalidate() is used, architecture needs to provide atomic pmdp_estabish(). pmdp_estabish() is not used out-side generic implementation of pmdp_invalidate() so far, but I think this can change in the future. This patch (of 12): This is an implementation of pmdp_establish() that is only suitable for an architecture that doesn't have hardware dirty/accessed bits. In this case we can't race with CPU which sets these bits and non-atomic approach is fine. 
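For architectures that do have hardware dirty/accessed bits, the window that the rest of the series closes looks roughly like this (schematic pseudo-sequence, not taken from any one architecture):

	pmd_t entry = *pmdp;			/* CPU A loads the entry                */
	entry = pmd_mknotpresent(entry);	/* ... and modifies its local copy ...  */
						/* meanwhile hardware on CPU B sets
						 * dirty/accessed in *pmdp             */
	set_pmd_at(mm, addr, pmdp, entry);	/* CPU A stores; CPU B's bits are lost  */

	/*
	 * With an atomic pmdp_establish() the load and store collapse into one
	 * exchange, and the returned old value still carries whatever bits the
	 * hardware set in between, so the caller can fold them back in:
	 */
	pmd_t old = pmdp_establish(vma, addr, pmdp, newpmd);
	if (pmd_dirty(old))
		newpmd = pmd_mkdirty(newpmd);
	if (pmd_young(old))
		newpmd = pmd_mkyoung(newpmd);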
Link: http://lkml.kernel.org/r/20171213105756.69879-2-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: David Daney Cc: David Miller Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Nitin Gupta Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 868e68561f91..118ca2eb7a32 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -309,6 +309,21 @@ extern void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); #endif +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* + * This is an implementation of pmdp_establish() that is only suitable for an + * architecture that doesn't have hardware dirty/accessed bits. In this case we + * can't race with CPU which sets these bits and non-atomic aproach is fine. + */ +static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old_pmd = *pmdp; + set_pmd_at(vma->vm_mm, address, pmdp, pmd); + return old_pmd; +} +#endif + #ifndef __HAVE_ARCH_PMDP_INVALIDATE extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); From 5c8aa7ea4f988f6759b49265b4ad1cdd058e8406 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:17:48 -0800 Subject: [PATCH 060/118] arc: use generic_pmdp_establish as pmdp_establish ARC doesn't support hardware dirty/accessed bits. generic_pmdp_establish() is suitable in this case. Link: http://lkml.kernel.org/r/20171213105756.69879-3-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Vineet Gupta Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arc/include/asm/hugepage.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arc/include/asm/hugepage.h b/arch/arc/include/asm/hugepage.h index b18fcb606908..dc8ee011882f 100644 --- a/arch/arc/include/asm/hugepage.h +++ b/arch/arc/include/asm/hugepage.h @@ -74,4 +74,7 @@ extern pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); extern void flush_pmd_tlb_range(struct vm_area_struct *vma, unsigned long start, unsigned long end); +/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/ +#define pmdp_establish generic_pmdp_establish + #endif From ef298cc567684e33d0c9f490e71884851b646d41 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:17:51 -0800 Subject: [PATCH 061/118] arm/mm: provide pmdp_establish() helper ARM LPAE doesn't have hardware dirty/accessed bits. generic_pmdp_establish() is the right implementation of pmdp_establish for this case. Link: http://lkml.kernel.org/r/20171213105756.69879-4-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Cc: Catalin Marinas Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/include/asm/pgtable-3level.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 1a7a17b2a1ba..2a4836087358 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -249,6 +249,9 @@ PMD_BIT_FUNC(mkyoung, |= PMD_SECT_AF); #define pfn_pmd(pfn,prot) (__pmd(((phys_addr_t)(pfn) << PAGE_SHIFT) | pgprot_val(prot))) #define mk_pmd(page,prot) pfn_pmd(page_to_pfn(page),prot) +/* No hardware dirty/accessed bits -- generic_pmdp_establish() fits */ +#define pmdp_establish generic_pmdp_establish + /* represent a notpresent pmd by faulting entry, this is used by pmdp_invalidate */ static inline pmd_t pmd_mknotpresent(pmd_t pmd) { From 1d78a62cb3bb2bd95d00149daaa144f1fe0a77df Mon Sep 17 00:00:00 2001 From: Catalin Marinas Date: Wed, 31 Jan 2018 16:17:55 -0800 Subject: [PATCH 062/118] arm64: provide pmdp_establish() helper We need an atomic way to setup pmd page table entry, avoiding races with CPU setting dirty/accessed bits. This is required to implement pmdp_invalidate() that doesn't lose these bits. Link: http://lkml.kernel.org/r/20171213105756.69879-5-kirill.shutemov@linux.intel.com Signed-off-by: Catalin Marinas Signed-off-by: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm64/include/asm/pgtable.h | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 89167c43ebb5..094374c82db0 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -706,6 +706,13 @@ static inline void pmdp_set_wrprotect(struct mm_struct *mm, { ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } + +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + return __pmd(xchg_relaxed(&pmd_val(*pmdp), pmd_val(pmd))); +} #endif extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; From b6b34b2dfb7bab7b76a08862fe034c3bb29ec20d Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:17:58 -0800 Subject: [PATCH 063/118] mips: use generic_pmdp_establish as pmdp_establish MIPS doesn't support hardware dirty/accessed bits. generic_pmdp_establish() is suitable in this case. Link: http://lkml.kernel.org/r/20171213105756.69879-6-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. 
Shutemov Cc: Ralf Baechle Cc: David Daney Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/mips/include/asm/pgtable.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/arch/mips/include/asm/pgtable.h b/arch/mips/include/asm/pgtable.h index 1a508a74d48d..129e0328367f 100644 --- a/arch/mips/include/asm/pgtable.h +++ b/arch/mips/include/asm/pgtable.h @@ -534,6 +534,9 @@ static inline int io_remap_pfn_range(struct vm_area_struct *vma, #ifdef CONFIG_TRANSPARENT_HUGEPAGE +/* We don't have hardware dirty/accessed bits, generic_pmdp_establish is fine.*/ +#define pmdp_establish generic_pmdp_establish + #define has_transparent_hugepage has_transparent_hugepage extern int has_transparent_hugepage(void); From 8cc931e03339eebbdbaa2ac1998d25a8a90b77d4 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 31 Jan 2018 16:18:02 -0800 Subject: [PATCH 064/118] powerpc/mm: update pmdp_invalidate to return old pmd value It's required to avoid losing dirty and accessed bits. Link: http://lkml.kernel.org/r/20171213105756.69879-7-kirill.shutemov@linux.intel.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/pgtable.h | 4 ++-- arch/powerpc/mm/pgtable-book3s64.c | 7 +++++-- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 44697817ccc6..ee19d5bbee06 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1137,8 +1137,8 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, } #define __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, - pmd_t *pmdp); +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, + pmd_t *pmdp); #define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, diff --git a/arch/powerpc/mm/pgtable-book3s64.c b/arch/powerpc/mm/pgtable-book3s64.c index 3b65917785a5..422e80253a33 100644 --- a/arch/powerpc/mm/pgtable-book3s64.c +++ b/arch/powerpc/mm/pgtable-book3s64.c @@ -90,16 +90,19 @@ void serialize_against_pte_lookup(struct mm_struct *mm) * We use this to invalidate a pmdp entry before switching from a * hugepte to regular pmd entry. */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); + unsigned long old_pmd; + + old_pmd = pmd_hugepage_update(vma->vm_mm, address, pmdp, _PAGE_PRESENT, 0); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* * This ensures that generic code that rely on IRQ disabling * to prevent a parallel THP split work as expected. */ serialize_against_pte_lookup(vma->vm_mm); + return __pmd(old_pmd); } static pmd_t pmd_set_protbits(pmd_t pmd, pgprot_t pgprot) From 9c4563f11fcd5c65efcd64db2b974bd9b1728eef Mon Sep 17 00:00:00 2001 From: Martin Schwidefsky Date: Wed, 31 Jan 2018 16:18:05 -0800 Subject: [PATCH 065/118] s390/mm: modify pmdp_invalidate to return old value. It's required to avoid losing dirty and accessed bits. 
Link: http://lkml.kernel.org/r/20171213105756.69879-8-kirill.shutemov@linux.intel.com Signed-off-by: Martin Schwidefsky Signed-off-by: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/include/asm/pgtable.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 0a6b0286c32e..2d24d33bf188 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -1505,12 +1505,12 @@ static inline pmd_t pmdp_huge_clear_flush(struct vm_area_struct *vma, } #define __HAVE_ARCH_PMDP_INVALIDATE -static inline void pmdp_invalidate(struct vm_area_struct *vma, +static inline pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); - pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); + return pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT From a8e654f01cb725d0bfd741ebca1bf4c9337969cc Mon Sep 17 00:00:00 2001 From: Nitin Gupta Date: Wed, 31 Jan 2018 16:18:09 -0800 Subject: [PATCH 066/118] sparc64: update pmdp_invalidate() to return old pmd value It's required to avoid losing dirty and accessed bits. [akpm@linux-foundation.org: add a `do' to the do-while loop] Link: http://lkml.kernel.org/r/20171213105756.69879-9-kirill.shutemov@linux.intel.com Signed-off-by: Nitin Gupta Signed-off-by: Kirill A. Shutemov Cc: David Miller Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/sparc/include/asm/pgtable_64.h | 2 +- arch/sparc/mm/tlb.c | 23 ++++++++++++++++++----- 2 files changed, 19 insertions(+), 6 deletions(-) diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 9937c5ff94a9..339920fdf9ed 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -1010,7 +1010,7 @@ void update_mmu_cache_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmd); #define __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #define __HAVE_ARCH_PGTABLE_DEPOSIT diff --git a/arch/sparc/mm/tlb.c b/arch/sparc/mm/tlb.c index 4ae86bc0d35c..847ddffbf38a 100644 --- a/arch/sparc/mm/tlb.c +++ b/arch/sparc/mm/tlb.c @@ -219,17 +219,28 @@ void set_pmd_at(struct mm_struct *mm, unsigned long addr, } } +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old; + + do { + old = *pmdp; + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + + return old; +} + /* * This routine is only called when splitting a THP */ -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; + pmd_t old, entry; - pmd_val(entry) &= ~_PAGE_VALID; - - set_pmd_at(vma->vm_mm, address, pmdp, entry); + entry = __pmd(pmd_val(*pmdp) & ~_PAGE_VALID); + old = pmdp_establish(vma, address, pmdp, entry); flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); /* @@ -240,6 +251,8 @@ void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, if ((pmd_val(entry) & _PAGE_PMD_HUGE) && !is_huge_zero_page(pmd_page(entry))) 
(vma->vm_mm)->context.thp_pte_count--; + + return old; } void pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, From 86fa949b050184ffc53688516a6a83ae5f98d08a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:18:13 -0800 Subject: [PATCH 067/118] x86/mm: provide pmdp_establish() helper We need an atomic way to setup pmd page table entry, avoiding races with CPU setting dirty/accessed bits. This is required to implement pmdp_invalidate() that doesn't lose these bits. On PAE we can avoid expensive cmpxchg8b for cases when new page table entry is not present. If it's present, fallback to cpmxchg loop. [akpm@linux-foundation.org: add missing `do' to do-while loop] Link: http://lkml.kernel.org/r/20171213105756.69879-10-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Ingo Molnar Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Vlastimil Babka Cc: Andrea Arcangeli Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pgtable-3level.h | 37 ++++++++++++++++++++++++++- arch/x86/include/asm/pgtable.h | 15 +++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h index bc4af5453802..f24df59c40b2 100644 --- a/arch/x86/include/asm/pgtable-3level.h +++ b/arch/x86/include/asm/pgtable-3level.h @@ -158,7 +158,6 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) #endif -#ifdef CONFIG_SMP union split_pmd { struct { u32 pmd_low; @@ -166,6 +165,8 @@ union split_pmd { }; pmd_t pmd; }; + +#ifdef CONFIG_SMP static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) { union split_pmd res, *orig = (union split_pmd *)pmdp; @@ -181,6 +182,40 @@ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) #endif +#ifndef pmdp_establish +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + pmd_t old; + + /* + * If pmd has present bit cleared we can get away without expensive + * cmpxchg64: we can update pmdp half-by-half without racing with + * anybody. 
+ */ + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { + union split_pmd old, new, *ptr; + + ptr = (union split_pmd *)pmdp; + + new.pmd = pmd; + + /* xchg acts as a barrier before setting of the high bits */ + old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); + old.pmd_high = ptr->pmd_high; + ptr->pmd_high = new.pmd_high; + return old.pmd; + } + + do { + old = *pmdp; + } while (cmpxchg64(&pmdp->pmd, old.pmd, pmd.pmd) != old.pmd); + + return old; +} +#endif + #ifdef CONFIG_SMP union split_pud { struct { diff --git a/arch/x86/include/asm/pgtable.h b/arch/x86/include/asm/pgtable.h index e42b8943cb1a..63c2552b6b65 100644 --- a/arch/x86/include/asm/pgtable.h +++ b/arch/x86/include/asm/pgtable.h @@ -1109,6 +1109,21 @@ static inline int pud_write(pud_t pud) return pud_flags(pud) & _PAGE_RW; } +#ifndef pmdp_establish +#define pmdp_establish pmdp_establish +static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + unsigned long address, pmd_t *pmdp, pmd_t pmd) +{ + if (IS_ENABLED(CONFIG_SMP)) { + return xchg(pmdp, pmd); + } else { + pmd_t old = *pmdp; + *pmdp = pmd; + return old; + } +} +#endif + /* * clone_pgd_range(pgd_t *dst, pgd_t *src, int count); * From d52605d7cb306aaf86d0e6dede275dbf8a020072 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:18:16 -0800 Subject: [PATCH 068/118] mm: do not lose dirty and accessed bits in pmdp_invalidate() Vlastimil noted that pmdp_invalidate() is not atomic and we can lose dirty and access bits if CPU sets them after pmdp dereference, but before set_pmd_at(). The patch change pmdp_invalidate() to make the entry non-present atomically and return previous value of the entry. This value can be used to check if CPU set dirty/accessed bits under us. The race window is very small and I haven't seen any reports that can be attributed to the bug. For this reason, I don't think backporting to stable trees needed. Link: http://lkml.kernel.org/r/20171213105756.69879-11-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Reported-by: Vlastimil Babka Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: David Daney Cc: David Miller Cc: H. 
Peter Anvin Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Nitin Gupta Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vineet Gupta Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/pgtable.h | 2 +- mm/pgtable-generic.c | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 118ca2eb7a32..51eebd7546b2 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -325,7 +325,7 @@ static inline pmd_t generic_pmdp_establish(struct vm_area_struct *vma, #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE -extern void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif diff --git a/mm/pgtable-generic.c b/mm/pgtable-generic.c index 1e4ee763c190..cf2af04b34b9 100644 --- a/mm/pgtable-generic.c +++ b/mm/pgtable-generic.c @@ -181,12 +181,12 @@ pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) #endif #ifndef __HAVE_ARCH_PMDP_INVALIDATE -void pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, +pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp) { - pmd_t entry = *pmdp; - set_pmd_at(vma->vm_mm, address, pmdp, pmd_mknotpresent(entry)); + pmd_t old = pmdp_establish(vma, address, pmdp, pmd_mknotpresent(*pmdp)); flush_pmd_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + return old; } #endif From a3cf988fcb88301912f95ecf66913502bcb90200 Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Wed, 31 Jan 2018 16:18:20 -0800 Subject: [PATCH 069/118] mm: use updated pmdp_invalidate() interface to track dirty/accessed bits Use the modifed pmdp_invalidate() that returns the previous value of pmd to transfer dirty and accessed bits. Link: http://lkml.kernel.org/r/20171213105756.69879-12-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Aneesh Kumar K.V Cc: Catalin Marinas Cc: David Daney Cc: David Miller Cc: H. Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Nitin Gupta Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 8 ++++---- mm/huge_memory.c | 29 ++++++++++++----------------- 2 files changed, 16 insertions(+), 21 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 4691f5aca00e..ec6d2983a5cb 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -982,14 +982,14 @@ static inline void clear_soft_dirty(struct vm_area_struct *vma, static inline void clear_soft_dirty_pmd(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmd_t pmd = *pmdp; + pmd_t old, pmd = *pmdp; if (pmd_present(pmd)) { /* See comment in change_huge_pmd() */ - pmdp_invalidate(vma, addr, pmdp); - if (pmd_dirty(*pmdp)) + old = pmdp_invalidate(vma, addr, pmdp); + if (pmd_dirty(old)) pmd = pmd_mkdirty(pmd); - if (pmd_young(*pmdp)) + if (pmd_young(old)) pmd = pmd_mkyoung(pmd); pmd = pmd_wrprotect(pmd); diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 0d3ae51ce4f7..2a79a6b7d19b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1910,17 +1910,7 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * pmdp_invalidate() is required to make sure we don't miss * dirty/young flags set by hardware. */ - entry = *pmd; - pmdp_invalidate(vma, addr, pmd); - - /* - * Recover dirty/young flags. 
It relies on pmdp_invalidate to not - * corrupt them. - */ - if (pmd_dirty(*pmd)) - entry = pmd_mkdirty(entry); - if (pmd_young(*pmd)) - entry = pmd_mkyoung(entry); + entry = pmdp_invalidate(vma, addr, pmd); entry = pmd_modify(entry, newprot); if (preserve_write) @@ -2073,8 +2063,8 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; - pmd_t _pmd; - bool young, write, dirty, soft_dirty, pmd_migration = false; + pmd_t old, _pmd; + bool young, write, soft_dirty, pmd_migration = false; unsigned long addr; int i; @@ -2130,7 +2120,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, page_ref_add(page, HPAGE_PMD_NR - 1); write = pmd_write(*pmd); young = pmd_young(*pmd); - dirty = pmd_dirty(*pmd); soft_dirty = pmd_soft_dirty(*pmd); pmdp_huge_split_prepare(vma, haddr, pmd); @@ -2160,8 +2149,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, if (soft_dirty) entry = pte_mksoft_dirty(entry); } - if (dirty) - SetPageDirty(page + i); pte = pte_offset_map(&_pmd, addr); BUG_ON(!pte_none(*pte)); set_pte_at(mm, addr, pte, entry); @@ -2209,7 +2196,15 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, * for this pmd), then we flush the SMP TLB and finally we write the * non-huge version of the pmd entry with pmd_populate. */ - pmdp_invalidate(vma, haddr, pmd); + old = pmdp_invalidate(vma, haddr, pmd); + + /* + * Transfer dirty bit using value returned by pmd_invalidate() to be + * sure we don't race with CPU that can set the bit under us. + */ + if (pmd_dirty(old)) + SetPageDirty(page); + pmd_populate(mm, pmd, pgtable); if (freeze) { From 423ac9af3ceff967a77b0714781033629593b077 Mon Sep 17 00:00:00 2001 From: "Aneesh Kumar K.V" Date: Wed, 31 Jan 2018 16:18:24 -0800 Subject: [PATCH 070/118] mm/thp: remove pmd_huge_split_prepare() Instead of marking the pmd ready for split, invalidate the pmd. This should take care of powerpc requirement. Only side effect is that we mark the pmd invalid early. This can result in us blocking access to the page a bit longer if we race against a thp split. [kirill.shutemov@linux.intel.com: rebased, dirty THP once] Link: http://lkml.kernel.org/r/20171213105756.69879-13-kirill.shutemov@linux.intel.com Signed-off-by: Aneesh Kumar K.V Signed-off-by: Kirill A. Shutemov Cc: Andrea Arcangeli Cc: Catalin Marinas Cc: David Daney Cc: David Miller Cc: H. 
Peter Anvin Cc: Hugh Dickins Cc: Ingo Molnar Cc: Martin Schwidefsky Cc: Michal Hocko Cc: Nitin Gupta Cc: Ralf Baechle Cc: Thomas Gleixner Cc: Vineet Gupta Cc: Vlastimil Babka Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/powerpc/include/asm/book3s/64/hash-4k.h | 2 - arch/powerpc/include/asm/book3s/64/hash-64k.h | 2 - arch/powerpc/include/asm/book3s/64/pgtable.h | 9 --- arch/powerpc/include/asm/book3s/64/radix.h | 6 -- arch/powerpc/mm/pgtable-hash64.c | 22 ------ include/asm-generic/pgtable.h | 8 --- mm/huge_memory.c | 72 +++++++++---------- 7 files changed, 35 insertions(+), 86 deletions(-) diff --git a/arch/powerpc/include/asm/book3s/64/hash-4k.h b/arch/powerpc/include/asm/book3s/64/hash-4k.h index 197ced1eaaa0..2d9df40446f6 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-4k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-4k.h @@ -101,8 +101,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/hash-64k.h b/arch/powerpc/include/asm/book3s/64/hash-64k.h index 8d40cf03cb67..cb46d1034f33 100644 --- a/arch/powerpc/include/asm/book3s/64/hash-64k.h +++ b/arch/powerpc/include/asm/book3s/64/hash-64k.h @@ -203,8 +203,6 @@ extern pmd_t hash__pmdp_collapse_flush(struct vm_area_struct *vma, extern void hash__pgtable_trans_huge_deposit(struct mm_struct *mm, pmd_t *pmdp, pgtable_t pgtable); extern pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp); -extern void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp); extern pmd_t hash__pmdp_huge_get_and_clear(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp); extern int hash__has_transparent_hugepage(void); diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index ee19d5bbee06..6ca1208cedcb 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -1140,15 +1140,6 @@ static inline pgtable_t pgtable_trans_huge_withdraw(struct mm_struct *mm, extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); -#define __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - if (radix_enabled()) - return radix__pmdp_huge_split_prepare(vma, address, pmdp); - return hash__pmdp_huge_split_prepare(vma, address, pmdp); -} - #define pmd_move_must_withdraw pmd_move_must_withdraw struct spinlock; static inline int pmd_move_must_withdraw(struct spinlock *new_pmd_ptl, diff --git a/arch/powerpc/include/asm/book3s/64/radix.h b/arch/powerpc/include/asm/book3s/64/radix.h index 19c44e1495ae..365010f66570 100644 --- a/arch/powerpc/include/asm/book3s/64/radix.h +++ b/arch/powerpc/include/asm/book3s/64/radix.h @@ -269,12 +269,6 @@ static inline pmd_t radix__pmd_mkhuge(pmd_t pmd) return __pmd(pmd_val(pmd) | _PAGE_PTE | R_PAGE_LARGE); return __pmd(pmd_val(pmd) | _PAGE_PTE); } -static inline void radix__pmdp_huge_split_prepare(struct vm_area_struct *vma, 
- unsigned long address, pmd_t *pmdp) -{ - /* Nothing to do for radix. */ - return; -} extern unsigned long radix__pmd_hugepage_update(struct mm_struct *mm, unsigned long addr, pmd_t *pmdp, unsigned long clr, diff --git a/arch/powerpc/mm/pgtable-hash64.c b/arch/powerpc/mm/pgtable-hash64.c index ec277913e01b..469808e77e58 100644 --- a/arch/powerpc/mm/pgtable-hash64.c +++ b/arch/powerpc/mm/pgtable-hash64.c @@ -296,28 +296,6 @@ pgtable_t hash__pgtable_trans_huge_withdraw(struct mm_struct *mm, pmd_t *pmdp) return pgtable; } -void hash__pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - VM_BUG_ON(address & ~HPAGE_PMD_MASK); - VM_BUG_ON(REGION_ID(address) != USER_REGION_ID); - VM_BUG_ON(pmd_devmap(*pmdp)); - - /* - * We can't mark the pmd none here, because that will cause a race - * against exit_mmap. We need to continue mark pmd TRANS HUGE, while - * we spilt, but at the same time we wan't rest of the ppc64 code - * not to insert hash pte on this, because we will be modifying - * the deposited pgtable in the caller of this function. Hence - * clear the _PAGE_USER so that we move the fault handling to - * higher level function and that will serialize against ptl. - * We need to flush existing hash pte entries here even though, - * the translation is still valid, because we will withdraw - * pgtable_t after this. - */ - pmd_hugepage_update(vma->vm_mm, address, pmdp, 0, _PAGE_PRIVILEGED); -} - /* * A linux hugepage PMD was changed and the corresponding hash table entries * neesd to be flushed. diff --git a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h index 51eebd7546b2..2cfa3075d148 100644 --- a/include/asm-generic/pgtable.h +++ b/include/asm-generic/pgtable.h @@ -329,14 +329,6 @@ extern pmd_t pmdp_invalidate(struct vm_area_struct *vma, unsigned long address, pmd_t *pmdp); #endif -#ifndef __HAVE_ARCH_PMDP_HUGE_SPLIT_PREPARE -static inline void pmdp_huge_split_prepare(struct vm_area_struct *vma, - unsigned long address, pmd_t *pmdp) -{ - -} -#endif - #ifndef __HAVE_ARCH_PTE_SAME static inline int pte_same(pte_t pte_a, pte_t pte_b) { diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 2a79a6b7d19b..87ab9b8f56b5 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -2063,7 +2063,7 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, struct mm_struct *mm = vma->vm_mm; struct page *page; pgtable_t pgtable; - pmd_t old, _pmd; + pmd_t old_pmd, _pmd; bool young, write, soft_dirty, pmd_migration = false; unsigned long addr; int i; @@ -2106,23 +2106,50 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, return __split_huge_zero_page_pmd(vma, haddr, pmd); } + /* + * Up to this point the pmd is present and huge and userland has the + * whole access to the hugepage during the split (which happens in + * place). If we overwrite the pmd with the not-huge version pointing + * to the pte here (which of course we could if all CPUs were bug + * free), userland could trigger a small page size TLB miss on the + * small sized TLB while the hugepage TLB entry is still established in + * the huge TLB. Some CPU doesn't like that. + * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum + * 383 on page 93. Intel should be safe but is also warns that it's + * only safe if the permission and cache attributes of the two entries + * loaded in the two TLB is identical (which should be the case here). 
+ * But it is generally safer to never allow small and huge TLB entries + * for the same virtual address to be loaded simultaneously. So instead + * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the + * current pmd notpresent (atomically because here the pmd_trans_huge + * must remain set at all times on the pmd until the split is complete + * for this pmd), then we flush the SMP TLB and finally we write the + * non-huge version of the pmd entry with pmd_populate. + */ + old_pmd = pmdp_invalidate(vma, haddr, pmd); + #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION - pmd_migration = is_pmd_migration_entry(*pmd); + pmd_migration = is_pmd_migration_entry(old_pmd); if (pmd_migration) { swp_entry_t entry; - entry = pmd_to_swp_entry(*pmd); + entry = pmd_to_swp_entry(old_pmd); page = pfn_to_page(swp_offset(entry)); } else #endif - page = pmd_page(*pmd); + page = pmd_page(old_pmd); VM_BUG_ON_PAGE(!page_count(page), page); page_ref_add(page, HPAGE_PMD_NR - 1); - write = pmd_write(*pmd); - young = pmd_young(*pmd); - soft_dirty = pmd_soft_dirty(*pmd); + if (pmd_dirty(old_pmd)) + SetPageDirty(page); + write = pmd_write(old_pmd); + young = pmd_young(old_pmd); + soft_dirty = pmd_soft_dirty(old_pmd); - pmdp_huge_split_prepare(vma, haddr, pmd); + /* + * Withdraw the table only after we mark the pmd entry invalid. + * This's critical for some architectures (Power). + */ pgtable = pgtable_trans_huge_withdraw(mm, pmd); pmd_populate(mm, &_pmd, pgtable); @@ -2176,35 +2203,6 @@ static void __split_huge_pmd_locked(struct vm_area_struct *vma, pmd_t *pmd, } smp_wmb(); /* make pte visible before pmd */ - /* - * Up to this point the pmd is present and huge and userland has the - * whole access to the hugepage during the split (which happens in - * place). If we overwrite the pmd with the not-huge version pointing - * to the pte here (which of course we could if all CPUs were bug - * free), userland could trigger a small page size TLB miss on the - * small sized TLB while the hugepage TLB entry is still established in - * the huge TLB. Some CPU doesn't like that. - * See http://support.amd.com/us/Processor_TechDocs/41322.pdf, Erratum - * 383 on page 93. Intel should be safe but is also warns that it's - * only safe if the permission and cache attributes of the two entries - * loaded in the two TLB is identical (which should be the case here). - * But it is generally safer to never allow small and huge TLB entries - * for the same virtual address to be loaded simultaneously. So instead - * of doing "pmd_populate(); flush_pmd_tlb_range();" we first mark the - * current pmd notpresent (atomically because here the pmd_trans_huge - * must remain set at all times on the pmd until the split is complete - * for this pmd), then we flush the SMP TLB and finally we write the - * non-huge version of the pmd entry with pmd_populate. - */ - old = pmdp_invalidate(vma, haddr, pmd); - - /* - * Transfer dirty bit using value returned by pmd_invalidate() to be - * sure we don't race with CPU that can set the bit under us. - */ - if (pmd_dirty(old)) - SetPageDirty(page); - pmd_populate(mm, pmd, pgtable); if (freeze) { From 3b454ad35043dfbd3b5d2bb92b0991d6342afb44 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:18:28 -0800 Subject: [PATCH 071/118] mm: thp: use down_read_trylock() in khugepaged to avoid long block MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In the current design, khugepaged needs to acquire mmap_sem before scanning an mm. 
But in some corner cases, khugepaged may scan a process which is modifying its memory mapping, so khugepaged blocks in uninterruptible state. But the process might hold the mmap_sem for a long time when modifying a huge memory space and it may trigger the below khugepaged hung issue: INFO: task khugepaged:270 blocked for more than 120 seconds. Tainted: G E 4.9.65-006.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. khugepaged D 0 270 2 0x00000000  ffff883f3deae4c0 0000000000000000 ffff883f610596c0 ffff883f7d359440 ffff883f63818000 ffffc90019adfc78 ffffffff817079a5 d67e5aa8c1860a64 0000000000000246 ffff883f7d359440 ffffc90019adfc88 ffff883f610596c0 Call Trace: schedule+0x36/0x80 rwsem_down_read_failed+0xf0/0x150 call_rwsem_down_read_failed+0x18/0x30 down_read+0x20/0x40 khugepaged+0x476/0x11d0 kthread+0xe6/0x100 ret_from_fork+0x25/0x30 So it sounds pointless to just block khugepaged waiting for the semaphore so replace down_read() with down_read_trylock() to move to scan the next mm quickly instead of just blocking on the semaphore so that other processes can get more chances to install THP. Then khugepaged can come back to scan the skipped mm when it has finished the current round full_scan. And it appears that the change can improve khugepaged efficiency a little bit. Below is the test result when running LTP on a 24 cores 4GB memory 2 nodes NUMA VM: pristine w/ trylock full_scan 197 187 pages_collapsed 21 26 thp_fault_alloc 40818 44466 thp_fault_fallback 18413 16679 thp_collapse_alloc 21 150 thp_collapse_alloc_failed 14 16 thp_file_alloc 369 369 [akpm@linux-foundation.org: coding-style fixes] [akpm@linux-foundation.org: tweak comment] [arnd@arndb.de: avoid uninitialized variable use] Link: http://lkml.kernel.org/r/20171215125129.2948634-1-arnd@arndb.de Link: http://lkml.kernel.org/r/1513281203-54878-1-git-send-email-yang.s@alibaba-inc.com Signed-off-by: Yang Shi Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Andrea Arcangeli Signed-off-by: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/khugepaged.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/mm/khugepaged.c b/mm/khugepaged.c index 1cd18e4347fe..b7e2268dfc9a 100644 --- a/mm/khugepaged.c +++ b/mm/khugepaged.c @@ -1673,10 +1673,14 @@ static unsigned int khugepaged_scan_mm_slot(unsigned int pages, spin_unlock(&khugepaged_mm_lock); mm = mm_slot->mm; - down_read(&mm->mmap_sem); - if (unlikely(khugepaged_test_exit(mm))) - vma = NULL; - else + /* + * Don't wait for semaphore (to avoid long wait times). Just move to + * the next mm on the list. + */ + vma = NULL; + if (unlikely(!down_read_trylock(&mm->mmap_sem))) + goto breakouterloop_mmap_sem; + if (likely(!khugepaged_test_exit(mm))) vma = find_vma(mm, khugepaged_scan.address); progress++; From 5ff7091f5a2ca1b7b642ca0dbdede8f693a56926 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 31 Jan 2018 16:18:32 -0800 Subject: [PATCH 072/118] mm, mmu_notifier: annotate mmu notifiers with blockable invalidate callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Commit 4d4bbd8526a8 ("mm, oom_reaper: skip mm structs with mmu notifiers") prevented the oom reaper from unmapping private anonymous memory with the oom reaper when the oom victim mm had mmu notifiers registered. 
The rationale is that doing mmu_notifier_invalidate_range_{start,end}() around the unmap_page_range(), which is needed, can block and the oom killer will stall forever waiting for the victim to exit, which may not be possible without reaping. That concern is real, but only true for mmu notifiers that have blockable invalidate_range_{start,end}() callbacks. This patch adds a "flags" field to mmu notifier ops that can set a bit to indicate that these callbacks do not block. The implementation is steered toward an expensive slowpath, such as after the oom reaper has grabbed mm->mmap_sem of a still alive oom victim. [rientjes@google.com: mmu_notifier_invalidate_range_end() can also call the invalidate_range() must not block, fix comment] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1801091339570.240101@chino.kir.corp.google.com [akpm@linux-foundation.org: make mm_has_blockable_invalidate_notifiers() return bool, use rwsem_is_locked()] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1712141329500.74052@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Michal Hocko Acked-by: Paolo Bonzini Acked-by: Christian König Acked-by: Dimitri Sivanich Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Oded Gabbay Cc: Alex Deucher Cc: David Airlie Cc: Joerg Roedel Cc: Doug Ledford Cc: Jani Nikula Cc: Mike Marciniszyn Cc: Sean Hefty Cc: Boris Ostrovsky Cc: Jérôme Glisse Cc: Radim Krčmář Signed-off-by: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/infiniband/hw/hfi1/mmu_rb.c | 1 + drivers/iommu/amd_iommu_v2.c | 1 + drivers/iommu/intel-svm.c | 1 + drivers/misc/sgi-gru/grutlbpurge.c | 1 + include/linux/mmu_notifier.h | 30 +++++++++++++++++++++++++--- mm/mmu_notifier.c | 31 +++++++++++++++++++++++++++++ virt/kvm/kvm_main.c | 1 + 7 files changed, 63 insertions(+), 3 deletions(-) diff --git a/drivers/infiniband/hw/hfi1/mmu_rb.c b/drivers/infiniband/hw/hfi1/mmu_rb.c index e7b3ce123da6..70aceefe14d5 100644 --- a/drivers/infiniband/hw/hfi1/mmu_rb.c +++ b/drivers/infiniband/hw/hfi1/mmu_rb.c @@ -77,6 +77,7 @@ static void do_remove(struct mmu_rb_handler *handler, static void handle_remove(struct work_struct *work); static const struct mmu_notifier_ops mn_opts = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = mmu_notifier_range_start, }; diff --git a/drivers/iommu/amd_iommu_v2.c b/drivers/iommu/amd_iommu_v2.c index 7d94e1d39e5e..df72493a0f13 100644 --- a/drivers/iommu/amd_iommu_v2.c +++ b/drivers/iommu/amd_iommu_v2.c @@ -427,6 +427,7 @@ static void mn_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops iommu_mn = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = mn_release, .clear_flush_young = mn_clear_flush_young, .invalidate_range = mn_invalidate_range, diff --git a/drivers/iommu/intel-svm.c b/drivers/iommu/intel-svm.c index ed1cf7c5a43b..0a826eb7fe48 100644 --- a/drivers/iommu/intel-svm.c +++ b/drivers/iommu/intel-svm.c @@ -276,6 +276,7 @@ static void intel_mm_release(struct mmu_notifier *mn, struct mm_struct *mm) } static const struct mmu_notifier_ops intel_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .release = intel_mm_release, .change_pte = intel_change_pte, .invalidate_range = intel_invalidate_range, diff --git a/drivers/misc/sgi-gru/grutlbpurge.c b/drivers/misc/sgi-gru/grutlbpurge.c index 9918eda0e05f..a3454eb56fbf 100644 --- a/drivers/misc/sgi-gru/grutlbpurge.c +++ b/drivers/misc/sgi-gru/grutlbpurge.c @@ -258,6 +258,7 @@ static void gru_release(struct 
mmu_notifier *mn, struct mm_struct *mm) static const struct mmu_notifier_ops gru_mmuops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = gru_invalidate_range_start, .invalidate_range_end = gru_invalidate_range_end, .release = gru_release, diff --git a/include/linux/mmu_notifier.h b/include/linux/mmu_notifier.h index b25dc9db19fc..2d07a1ed5a31 100644 --- a/include/linux/mmu_notifier.h +++ b/include/linux/mmu_notifier.h @@ -2,6 +2,7 @@ #ifndef _LINUX_MMU_NOTIFIER_H #define _LINUX_MMU_NOTIFIER_H +#include #include #include #include @@ -10,6 +11,9 @@ struct mmu_notifier; struct mmu_notifier_ops; +/* mmu_notifier_ops flags */ +#define MMU_INVALIDATE_DOES_NOT_BLOCK (0x01) + #ifdef CONFIG_MMU_NOTIFIER /* @@ -26,6 +30,15 @@ struct mmu_notifier_mm { }; struct mmu_notifier_ops { + /* + * Flags to specify behavior of callbacks for this MMU notifier. + * Used to determine which context an operation may be called. + * + * MMU_INVALIDATE_DOES_NOT_BLOCK: invalidate_range_* callbacks do not + * block + */ + int flags; + /* * Called either by mmu_notifier_unregister or when the mm is * being destroyed by exit_mmap, always before all pages are @@ -137,6 +150,10 @@ struct mmu_notifier_ops { * page. Pages will no longer be referenced by the linux * address space but may still be referenced by sptes until * the last refcount is dropped. + * + * If both of these callbacks cannot block, and invalidate_range + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range_start)(struct mmu_notifier *mn, struct mm_struct *mm, @@ -159,12 +176,13 @@ struct mmu_notifier_ops { * external TLB range needs to be flushed. For more in depth * discussion on this see Documentation/vm/mmu_notifier.txt * - * The invalidate_range() function is called under the ptl - * spin-lock and not allowed to sleep. - * * Note that this function might be called with just a sub-range * of what was passed to invalidate_range_start()/end(), if * called between those functions. + * + * If this callback cannot block, and invalidate_range_{start,end} + * cannot block, mmu_notifier_ops.flags should have + * MMU_INVALIDATE_DOES_NOT_BLOCK set. */ void (*invalidate_range)(struct mmu_notifier *mn, struct mm_struct *mm, unsigned long start, unsigned long end); @@ -218,6 +236,7 @@ extern void __mmu_notifier_invalidate_range_end(struct mm_struct *mm, bool only_end); extern void __mmu_notifier_invalidate_range(struct mm_struct *mm, unsigned long start, unsigned long end); +extern bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm); static inline void mmu_notifier_release(struct mm_struct *mm) { @@ -457,6 +476,11 @@ static inline void mmu_notifier_invalidate_range(struct mm_struct *mm, { } +static inline bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + return false; +} + static inline void mmu_notifier_mm_init(struct mm_struct *mm) { } diff --git a/mm/mmu_notifier.c b/mm/mmu_notifier.c index 96edb33fd09a..eff6b88a993f 100644 --- a/mm/mmu_notifier.c +++ b/mm/mmu_notifier.c @@ -236,6 +236,37 @@ void __mmu_notifier_invalidate_range(struct mm_struct *mm, } EXPORT_SYMBOL_GPL(__mmu_notifier_invalidate_range); +/* + * Must be called while holding mm->mmap_sem for either read or write. + * The result is guaranteed to be valid until mm->mmap_sem is dropped. 
+ */ +bool mm_has_blockable_invalidate_notifiers(struct mm_struct *mm) +{ + struct mmu_notifier *mn; + int id; + bool ret = false; + + WARN_ON_ONCE(!rwsem_is_locked(&mm->mmap_sem)); + + if (!mm_has_notifiers(mm)) + return ret; + + id = srcu_read_lock(&srcu); + hlist_for_each_entry_rcu(mn, &mm->mmu_notifier_mm->list, hlist) { + if (!mn->ops->invalidate_range && + !mn->ops->invalidate_range_start && + !mn->ops->invalidate_range_end) + continue; + + if (!(mn->ops->flags & MMU_INVALIDATE_DOES_NOT_BLOCK)) { + ret = true; + break; + } + } + srcu_read_unlock(&srcu, id); + return ret; +} + static int do_mmu_notifier_register(struct mmu_notifier *mn, struct mm_struct *mm, int take_mmap_sem) diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c index d6b9370806f8..35db929f92f0 100644 --- a/virt/kvm/kvm_main.c +++ b/virt/kvm/kvm_main.c @@ -476,6 +476,7 @@ static void kvm_mmu_notifier_release(struct mmu_notifier *mn, } static const struct mmu_notifier_ops kvm_mmu_notifier_ops = { + .flags = MMU_INVALIDATE_DOES_NOT_BLOCK, .invalidate_range_start = kvm_mmu_notifier_invalidate_range_start, .invalidate_range_end = kvm_mmu_notifier_invalidate_range_end, .clear_flush_young = kvm_mmu_notifier_clear_flush_young, From f340ff820345b179b697f66ec6743c70416bf93f Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 31 Jan 2018 16:18:36 -0800 Subject: [PATCH 073/118] mm, oom: avoid reaping only for mm's with blockable invalidate callbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This uses the new annotation to determine if an mm has mmu notifiers with blockable invalidate range callbacks to avoid oom reaping. Otherwise, the callbacks are used around unmap_page_range(). Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1712141330120.74052@chino.kir.corp.google.com Signed-off-by: David Rientjes Acked-by: Michal Hocko Cc: Paolo Bonzini Cc: Christian König Cc: Dimitri Sivanich Cc: Andrea Arcangeli Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Oded Gabbay Cc: Alex Deucher Cc: David Airlie Cc: Joerg Roedel Cc: Doug Ledford Cc: Jani Nikula Cc: Mike Marciniszyn Cc: Sean Hefty Cc: Boris Ostrovsky Cc: Jérôme Glisse Cc: Radim Krčmář Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/oom_kill.c | 21 +++++++++++---------- 1 file changed, 11 insertions(+), 10 deletions(-) diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 29f855551efe..f2e7dfb81eee 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -514,15 +514,12 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) } /* - * If the mm has notifiers then we would need to invalidate them around - * unmap_page_range and that is risky because notifiers can sleep and - * what they do is basically undeterministic. So let's have a short + * If the mm has invalidate_{start,end}() notifiers that could block, * sleep to give the oom victim some more time. * TODO: we really want to get rid of this ugly hack and make sure that - * notifiers cannot block for unbounded amount of time and add - * mmu_notifier_invalidate_range_{start,end} around unmap_page_range + * notifiers cannot block for unbounded amount of time */ - if (mm_has_notifiers(mm)) { + if (mm_has_blockable_invalidate_notifiers(mm)) { up_read(&mm->mmap_sem); schedule_timeout_idle(HZ); goto unlock_oom; @@ -565,10 +562,14 @@ static bool __oom_reap_task_mm(struct task_struct *tsk, struct mm_struct *mm) * count elevated without a good reason. 
*/ if (vma_is_anonymous(vma) || !(vma->vm_flags & VM_SHARED)) { - tlb_gather_mmu(&tlb, mm, vma->vm_start, vma->vm_end); - unmap_page_range(&tlb, vma, vma->vm_start, vma->vm_end, - NULL); - tlb_finish_mmu(&tlb, vma->vm_start, vma->vm_end); + const unsigned long start = vma->vm_start; + const unsigned long end = vma->vm_end; + + tlb_gather_mmu(&tlb, mm, start, end); + mmu_notifier_invalidate_range_start(mm, start, end); + unmap_page_range(&tlb, vma, start, end, NULL); + mmu_notifier_invalidate_range_end(mm, start, end); + tlb_finish_mmu(&tlb, start, end); } } pr_info("oom_reaper: reaped process %d (%s), now anon-rss:%lukB, file-rss:%lukB, shmem-rss:%lukB\n", From 93144ca35041b05a4b23528d3bdf0d6414f43002 Mon Sep 17 00:00:00 2001 From: Aliaksei Karaliou Date: Wed, 31 Jan 2018 16:18:40 -0800 Subject: [PATCH 074/118] mm/zsmalloc: simplify shrinker init/destroy Structure zs_pool has special flag to indicate success of shrinker initialization. unregister_shrinker() has improved and can detect by itself whether actual deinitialization should be performed or not, so extra flag becomes redundant. [akpm@linux-foundation.org: update comment (Aliaksei), remove unneeded cast] Link: http://lkml.kernel.org/r/1513680552-9798-1-git-send-email-akaraliou.dev@gmail.com Signed-off-by: Aliaksei Karaliou Reviewed-by: Sergey Senozhatsky Acked-by: Minchan Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 22 +++++++++------------- 1 file changed, 9 insertions(+), 13 deletions(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index 683c0651098c..e136a8e72c48 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -257,11 +258,7 @@ struct zs_pool { /* Compact classes */ struct shrinker shrinker; - /* - * To signify that register_shrinker() was successful - * and unregister_shrinker() will not Oops. - */ - bool shrinker_enabled; + #ifdef CONFIG_ZSMALLOC_STAT struct dentry *stat_dentry; #endif @@ -2324,10 +2321,7 @@ static unsigned long zs_shrinker_count(struct shrinker *shrinker, static void zs_unregister_shrinker(struct zs_pool *pool) { - if (pool->shrinker_enabled) { - unregister_shrinker(&pool->shrinker); - pool->shrinker_enabled = false; - } + unregister_shrinker(&pool->shrinker); } static int zs_register_shrinker(struct zs_pool *pool) @@ -2426,11 +2420,13 @@ struct zs_pool *zs_create_pool(const char *name) goto err; /* - * Not critical, we still can use the pool - * and user can trigger compaction manually. + * Not critical since shrinker is only used to trigger internal + * defragmentation of the pool which is pretty optional thing. If + * registration fails we still can use the pool normally and user can + * trigger compaction manually. Thus, ignore return code. */ - if (zs_register_shrinker(pool) == 0) - pool->shrinker_enabled = true; + zs_register_shrinker(pool); + return pool; err: From e20df2c6a86cf8e2caeb3665427d077bfb97f177 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:44 -0800 Subject: [PATCH 075/118] mm: align struct page more aesthetically Patch series "Restructure struct page", v2. This series does not attempt any grand restructuring. Instead, it cures the worst of the indentitis, fixes the documentation and reduces the ifdeffery. The only layout change is compound_dtor and compound_order are each reduced to one byte. 
This patch (of 8): Instead of an ifdef block at the end of the struct, which needed its own comment, define _struct_page_alignment up at the top where it fits nicely with the existing comment. Link: http://lkml.kernel.org/r/20171220155552.15884-2-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index cfd0ac4e5e0e..4509f0cfaf39 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -39,6 +39,12 @@ struct hmm; * allows the use of atomic double word operations on the flags/mapping * and lru list pointers also. */ +#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE +#define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#else +#define _struct_page_alignment +#endif + struct page { /* First double word block */ unsigned long flags; /* Atomic flags, some possibly @@ -212,15 +218,7 @@ struct page { #ifdef LAST_CPUPID_NOT_IN_PAGE_FLAGS int _last_cpupid; #endif -} -/* - * The struct page can be forced to be double word aligned so that atomic ops - * on double words work. The SLUB allocator can make use of such a feature. - */ -#ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE - __aligned(2 * sizeof(unsigned long)) -#endif -; +} _struct_page_alignment; #define PAGE_FRAG_CACHE_MAX_SIZE __ALIGN_MASK(32768, ~PAGE_MASK) #define PAGE_FRAG_CACHE_MAX_ORDER get_order(PAGE_FRAG_CACHE_MAX_SIZE) From ca9c88c781b8e5d837068db6d1ca8e775fb7e154 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:47 -0800 Subject: [PATCH 076/118] mm: de-indent struct page I found the struct { union { struct { union { struct { } } } } } layout rather confusing. Fortunately, there is an easier way to write this. The innermost union is of four things which are the size of an int, so the ones which are used by slab/slob/slub can be pulled up two levels to be in the outermost union with 'counters'. That leaves us with struct { union { struct { atomic_t; atomic_t; } } } which has the same layout, but is easier to read. Output from the current git version of pahole, diffed with -uw to ignore the whitespace changes from the indentation: }; /* 16 8 */ union { long unsigned int counters; /* 24 8 */ - struct { - union { - atomic_t _mapcount; /* 24 4 */ unsigned int active; /* 24 4 */ struct { unsigned int inuse:16; /* 24:16 4 */ @@ -21,7 +18,8 @@ unsigned int frozen:1; /* 24: 0 4 */ }; /* 24 4 */ int units; /* 24 4 */ - }; /* 24 4 */ + struct { + atomic_t _mapcount; /* 24 4 */ atomic_t _refcount; /* 28 4 */ }; /* 24 8 */ }; /* 24 8 */ Link: http://lkml.kernel.org/r/20171220155552.15884-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 38 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 20 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 4509f0cfaf39..27973166af28 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -84,28 +84,26 @@ struct page { */ unsigned counters; #endif - struct { + unsigned int active; /* SLAB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + int units; /* SLOB */ - union { - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. - */ - atomic_t _mapcount; + struct { /* Page cache */ + /* + * Count of ptes mapped in mms, to show when + * page is mapped & limit reverse map searches. + * + * Extra information about page type may be + * stored here for pages that are never mapped, + * in which case the value MUST BE <= -2. + * See page-flags.h for more details. + */ + atomic_t _mapcount; - unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; - int units; /* SLOB */ - }; /* * Usage count, *USE WRAPPER FUNCTION* when manual * accounting. See page_ref.h From 4cf7c8bfb36f4b4dbc333bf844ea801d089f44f8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:51 -0800 Subject: [PATCH 077/118] mm: remove misleading alignment claims The "third double word block" isn't on 32-bit systems. The layout looks like this: unsigned long flags; struct address_space *mapping pgoff_t index; atomic_t _mapcount; atomic_t _refcount; which is 32 bytes on 64-bit, but 20 bytes on 32-bit. Nobody is trying to use the fact that it's double-word aligned today, so just remove the misleading claims. Link: http://lkml.kernel.org/r/20171220155552.15884-4-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Christoph Lameter Cc: Michal Hocko Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 27973166af28..c2294e6204e8 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,11 +33,11 @@ struct hmm; * a page, though if it is a pagecache page, rmap structures can tell us * who is mapping it. * - * The objects in struct page are organized in double word blocks in - * order to allows us to use atomic double word operations on portions - * of struct page. That is currently only used by slub but the arrangement - * allows the use of atomic double word operations on the flags/mapping - * and lru list pointers also. + * SLUB uses cmpxchg_double() to atomically update its freelist and + * counters. That requires that freelist & counters be adjacent and + * double-word aligned. We align all struct pages to double-word + * boundaries, and ensure that 'freelist' is aligned within the + * struct. 
*/ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) @@ -113,8 +113,6 @@ struct page { }; /* - * Third double word block - * * WARNING: bit 0 of the first word encode PageTail(). That means * the rest users of the storage space MUST NOT use the bit to * avoid collision and false-positive PageTail(). @@ -175,7 +173,6 @@ struct page { #endif }; - /* Remainder is not double word aligned */ union { unsigned long private; /* Mapping-private opaque data: * usually used for buffer_heads From b26435a0115b245ea2dd705efcce877ec417bc74 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:55 -0800 Subject: [PATCH 078/118] mm: improve comment on page->mapping The comment on page->mapping is terse, and out of date (it does not mention the possibility of PAGE_MAPPING_MOVABLE). Instead, point the interested reader to page-flags.h where there is a much better comment. Link: http://lkml.kernel.org/r/20171220155552.15884-5-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index c2294e6204e8..8c3b8cea22ee 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -50,15 +50,9 @@ struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ union { - struct address_space *mapping; /* If low bit clear, points to - * inode address_space, or NULL. - * If page mapped as anonymous - * memory, low bit is set, and - * it points to anon_vma object - * or KSM private structure. See - * PAGE_MAPPING_ANON and - * PAGE_MAPPING_KSM. - */ + /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + void *s_mem; /* slab first object */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ From 0dd4da5b110c6915d4244b8ed87a1c8d3945224b Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:18:58 -0800 Subject: [PATCH 079/118] mm: introduce _slub_counter_t Instead of putting the ifdef in the middle of the definition of struct page, pull it forward to the rest of the ifdeffery around the SLUB cmpxchg_double optimisation. Link: http://lkml.kernel.org/r/20171220155552.15884-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 8c3b8cea22ee..5521c9799c50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -41,9 +41,15 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) +#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) +#define _slub_counter_t unsigned long #else -#define _struct_page_alignment +#define _slub_counter_t unsigned int #endif +#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#define _struct_page_alignment +#define _slub_counter_t unsigned int +#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ struct page { /* First double word block */ @@ -66,18 +72,7 @@ struct page { }; union { -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) && \ - defined(CONFIG_HAVE_ALIGNED_STRUCT_PAGE) - /* Used for cmpxchg_double in slub */ - unsigned long counters; -#else - /* - * Keep _refcount separate from slub cmpxchg_double data. - * As the rest of the double word is protected by slab_lock - * but _refcount is not. - */ - unsigned counters; -#endif + _slub_counter_t counters; unsigned int active; /* SLAB */ struct { /* SLUB */ unsigned inuse:16; From 036e7aa49fb29e0b49b99a56fa5611d4a5b99fb1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:02 -0800 Subject: [PATCH 080/118] mm: store compound_dtor / compound_order as bytes Neither of these values get even close to 256; compound_dtor is currently at a maximum of 3, and compound_order can't be over 64. No machine has inefficient access to bytes since EV5, and while those are still supported, we don't optimise for them any more. This does not shrink struct page, but it removes an ifdef and frees up 2-6 bytes for future use. diff of pahole output: struct callback_head callback_head; /* 32 16 */ struct { long unsigned int compound_head; /* 32 8 */ - unsigned int compound_dtor; /* 40 4 */ - unsigned int compound_order; /* 44 4 */ + unsigned char compound_dtor; /* 40 1 */ + unsigned char compound_order; /* 41 1 */ }; /* 32 16 */ }; /* 32 16 */ union { [mawilcox@microsoft.com: add comment] Link: http://lkml.kernel.org/r/20171221000144.GB2980@bombadil.infradead.org Link: http://lkml.kernel.org/r/20171220155552.15884-7-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Michal Hocko Cc: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 16 +++------------- 1 file changed, 3 insertions(+), 13 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5521c9799c50..3e7e99784656 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -136,19 +136,9 @@ struct page { unsigned long compound_head; /* If bit zero is set */ /* First tail page only */ -#ifdef CONFIG_64BIT - /* - * On 64 bit system we have enough space in struct page - * to encode compound_dtor and compound_order with - * unsigned int. It can help compiler generate better or - * smaller code on some archtectures. 
- */ - unsigned int compound_dtor; - unsigned int compound_order; -#else - unsigned short int compound_dtor; - unsigned short int compound_order; -#endif + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ }; #if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS From be50015d7eec0e96b312468291d8209c1cc49908 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:06 -0800 Subject: [PATCH 081/118] mm: document how to use struct page Be really explicit about what bits / bytes are reserved for users that want to store extra information about the pages they allocate. Link: http://lkml.kernel.org/r/20171220155552.15884-8-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Reviewed-by: Randy Dunlap Acked-by: Michal Hocko Cc: Christoph Lameter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 24 +++++++++++++++++++++++- 1 file changed, 23 insertions(+), 1 deletion(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3e7e99784656..3f1fae8fb140 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -31,7 +31,29 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. + * who is mapping it. If you allocate the page using alloc_pages(), you + * can use some of the space in struct page for your own purposes. + * + * Pages that were once in the page cache may be found under the RCU lock + * even after they have been recycled to a different purpose. The page + * cache reads and writes some of the fields in struct page to pin the + * page before checking that it's still in the page cache. It is vital + * that all users of struct page: + * 1. Use the first word as PageFlags. + * 2. Clear or preserve bit 0 of page->compound_head. It is used as + * PageTail for compound pages, and the page cache must not see false + * positives. Some users put a pointer here (guaranteed to be at least + * 4-byte aligned), other users avoid using the field altogether. + * 3. page->_refcount must either not be used, or must be used in such a + * way that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * + * If you allocate pages of order > 0, you can use the fields in the struct + * page associated with each page, but bear in mind that the pages may have + * been inserted individually into the page cache, so you must use the above + * four fields in a compatible way for each struct page. * * SLUB uses cmpxchg_double() to atomically update its freelist and * counters. That requires that freelist & counters be adjacent and From ab8928b72fd77d936034da4c077f1580619697f4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Wed, 31 Jan 2018 16:19:11 -0800 Subject: [PATCH 082/118] mm: remove reference to PG_buddy PG_buddy doesn't exist any more. It's called PageBuddy now. Link: http://lkml.kernel.org/r/20171220155552.15884-9-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. 
Shutemov Acked-by: Michal Hocko Acked-by: Christoph Lameter Cc: Randy Dunlap Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3f1fae8fb140..fd1af6b9591d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -175,13 +175,13 @@ struct page { }; union { - unsigned long private; /* Mapping-private opaque data: - * usually used for buffer_heads - * if PagePrivate set; used for - * swp_entry_t if PageSwapCache; - * indicates order in the buddy - * system if PG_buddy is set. - */ + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; #if USE_SPLIT_PTE_PTLOCKS #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; From e9d586a8217882eb4068e3ed94a5234ba6dead34 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:14 -0800 Subject: [PATCH 083/118] shmem: unexport shmem_add_seals()/shmem_get_seals() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "memfd: add sealing to hugetlb-backed memory", v3. Recently, Mike Kravetz added hugetlbfs support to memfd. However, he didn't add sealing support. One of the reasons to use memfd is to have shared memory sealing when doing IPC or sharing memory with another process with some extra safety. qemu uses shared memory & hugetables with vhost-user (used by dpdk), so it is reasonable to use memfd now instead for convenience and security reasons. This patch (of 9): The functions are called through shmem_fcntl() only. And no danger in removing the EXPORTs as the routines only work with shmem file structs. 
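For illustration only (not part of this patch): a minimal userspace sketch of the sealing flow the series is aimed at. It uses only the existing memfd_create()/fcntl() sealing interface; the buffer name and size are arbitrary, and a libc that provides the memfd_create() wrapper is assumed. Pairing this flow with MFD_HUGETLB is what the remainder of the series enables.

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* Shared buffer another process will map; size is illustrative. */
	int fd = memfd_create("shared-buf", MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (fd < 0 || ftruncate(fd, 2 * 1024 * 1024) < 0)
		return 1;

	/* Freeze the size before handing the fd to the other side. */
	if (fcntl(fd, F_ADD_SEALS,
		  F_SEAL_GROW | F_SEAL_SHRINK | F_SEAL_SEAL) < 0)
		return 1;

	/* The receiving side can verify which seals are in place. */
	printf("seals: %#x\n", fcntl(fd, F_GET_SEALS));
	return 0;
}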
Link: http://lkml.kernel.org/r/20171107122800.25517-2-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/shmem_fs.h | 2 -- mm/shmem.c | 6 ++---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 06b295bec00d..e464815a7e4c 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,8 +112,6 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern int shmem_add_seals(struct file *file, unsigned int seals); -extern int shmem_get_seals(struct file *file); extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else diff --git a/mm/shmem.c b/mm/shmem.c index 7fbe67be86fa..975efd81621f 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2722,7 +2722,7 @@ continue_resched: F_SEAL_GROW | \ F_SEAL_WRITE) -int shmem_add_seals(struct file *file, unsigned int seals) +static int shmem_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2791,16 +2791,14 @@ unlock: inode_unlock(inode); return error; } -EXPORT_SYMBOL_GPL(shmem_add_seals); -int shmem_get_seals(struct file *file) +static int shmem_get_seals(struct file *file) { if (file->f_op != &shmem_file_operations) return -EINVAL; return SHMEM_I(file_inode(file))->seals; } -EXPORT_SYMBOL_GPL(shmem_get_seals); long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { From 5aadc431a593ac1f3a026dfbceaa16cc4d5e15ca Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:18 -0800 Subject: [PATCH 084/118] shmem: rename functions that are memfd-related MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Those functions are called for memfd files, backed by shmem or hugetlb (the next patches will handle hugetlb). 
Link: http://lkml.kernel.org/r/20171107122800.25517-3-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/fcntl.c | 2 +- include/linux/shmem_fs.h | 4 ++-- mm/shmem.c | 10 +++++----- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/fs/fcntl.c b/fs/fcntl.c index c7b9e0948107..e95fa0a352ea 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -418,7 +418,7 @@ static long do_fcntl(int fd, unsigned int cmd, unsigned long arg, break; case F_ADD_SEALS: case F_GET_SEALS: - err = shmem_fcntl(filp, cmd, arg); + err = memfd_fcntl(filp, cmd, arg); break; case F_GET_RW_HINT: case F_SET_RW_HINT: diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index e464815a7e4c..73b5e655a76e 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -112,11 +112,11 @@ extern void shmem_uncharge(struct inode *inode, long pages); #ifdef CONFIG_TMPFS -extern long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); #else -static inline long shmem_fcntl(struct file *f, unsigned int c, unsigned long a) +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) { return -EINVAL; } diff --git a/mm/shmem.c b/mm/shmem.c index 975efd81621f..86d7e06ee855 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2722,7 +2722,7 @@ continue_resched: F_SEAL_GROW | \ F_SEAL_WRITE) -static int shmem_add_seals(struct file *file, unsigned int seals) +static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); struct shmem_inode_info *info = SHMEM_I(inode); @@ -2792,7 +2792,7 @@ unlock: return error; } -static int shmem_get_seals(struct file *file) +static int memfd_get_seals(struct file *file) { if (file->f_op != &shmem_file_operations) return -EINVAL; @@ -2800,7 +2800,7 @@ static int shmem_get_seals(struct file *file) return SHMEM_I(file_inode(file))->seals; } -long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) { long error; @@ -2810,10 +2810,10 @@ long shmem_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (arg > UINT_MAX) return -EINVAL; - error = shmem_add_seals(file, arg); + error = memfd_add_seals(file, arg); break; case F_GET_SEALS: - error = shmem_get_seals(file); + error = memfd_get_seals(file); break; default: error = -EINVAL; From da14c1e524a56d62b846f73ae44fd722d63747b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:22 -0800 Subject: [PATCH 085/118] hugetlb: expose hugetlbfs_inode_info in header MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit hugetlbfs inode information will need to be accessed by code in mm/shmem.c for file sealing operations. Move inode information definition from .c file to header for needed access. 
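For illustration only (not taken verbatim from the series): with hugetlbfs_inode_info visible outside fs/hugetlbfs, the memfd code in mm/shmem.c can reach the per-inode seals for either backing store along the lines of the hypothetical helper below. SHMEM_I(), HUGETLBFS_I() and shmem_file_operations appear in the surrounding patches; the seals field for hugetlbfs inodes is only introduced by the next patch in the series.

/* Hypothetical dispatch helper, shown only to motivate the header move. */
static unsigned int *memfd_file_seals_ptr(struct file *file)
{
	if (file->f_op == &shmem_file_operations)
		return &SHMEM_I(file_inode(file))->seals;
	if (is_file_hugepages(file))
		return &HUGETLBFS_I(file_inode(file))->seals;
	return NULL;		/* not a sealable memfd backing store */
}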
Link: http://lkml.kernel.org/r/20171107122800.25517-4-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 10 ---------- include/linux/hugetlb.h | 10 ++++++++++ 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8a85f3f53446..89e29574c1dc 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -55,16 +55,6 @@ struct hugetlbfs_config { umode_t mode; }; -struct hugetlbfs_inode_info { - struct shared_policy policy; - struct inode vfs_inode; -}; - -static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) -{ - return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); -} - int sysctl_hugetlb_shm_group; enum { diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 6fcf140188d0..d02301e3f232 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -270,6 +270,16 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) return sb->s_fs_info; } +struct hugetlbfs_inode_info { + struct shared_policy policy; + struct inode vfs_inode; +}; + +static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) +{ + return container_of(inode, struct hugetlbfs_inode_info, vfs_inode); +} + extern const struct file_operations hugetlbfs_file_operations; extern const struct vm_operations_struct hugetlb_vm_ops; struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, From ff62a34210441103108d435ae8a00a777c4dcb99 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:25 -0800 Subject: [PATCH 086/118] hugetlb: implement memfd sealing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implements memfd sealing, similar to shmem: - WRITE: deny fallocate(PUNCH_HOLE). mmap() write is denied in memfd_add_seals(). write() doesn't exist for hugetlbfs. - SHRINK: added similar check as shmem_setattr() - GROW: added similar check as shmem_setattr() & shmem_fallocate() Except write() operation that doesn't exist with hugetlbfs, that should make sealing as close as it can be to shmem support. 
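A rough userspace illustration of the resulting semantics, assuming a kernel with the remainder of this series applied and a reserved 2MB default huge page (error handling trimmed):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/memfd.h>

int main(void)
{
        long sz = 2 * 1024 * 1024;      /* assumed default huge page size */
        int fd = syscall(__NR_memfd_create, "huge-sealed",
                         MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB);

        ftruncate(fd, sz);
        fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_WRITE);

        /* F_SEAL_GROW: hugetlbfs_setattr()/fallocate() refuse growth */
        if (ftruncate(fd, 2 * sz) == 0 || errno != EPERM)
                return 1;

        /* F_SEAL_WRITE: hugetlbfs_punch_hole() refuses hole punching */
        if (fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
                      0, sz) == 0 || errno != EPERM)
                return 1;

        return 0;
}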
Link: http://lkml.kernel.org/r/20171107122800.25517-5-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/hugetlbfs/inode.c | 29 +++++++++++++++++++++++++++-- include/linux/hugetlb.h | 1 + 2 files changed, 28 insertions(+), 2 deletions(-) diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 89e29574c1dc..8fe1b0aa2896 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -510,8 +510,16 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); inode_lock(inode); + + /* protected by i_mutex */ + if (info->seals & F_SEAL_WRITE) { + inode_unlock(inode); + return -EPERM; + } + i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap.rb_root)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -529,6 +537,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); struct address_space *mapping = inode->i_mapping; struct hstate *h = hstate_inode(inode); struct vm_area_struct pseudo_vma; @@ -560,6 +569,11 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, if (error) goto out; + if ((info->seals & F_SEAL_GROW) && offset + len > inode->i_size) { + error = -EPERM; + goto out; + } + /* * Initialize a pseudo vma as this is required by the huge page * allocation routines. If NUMA is configured, use page index @@ -650,6 +664,7 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) struct hstate *h = hstate_inode(inode); int error; unsigned int ia_valid = attr->ia_valid; + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); BUG_ON(!inode); @@ -658,9 +673,16 @@ static int hugetlbfs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE) { - if (attr->ia_size & ~huge_page_mask(h)) + loff_t oldsize = inode->i_size; + loff_t newsize = attr->ia_size; + + if (newsize & ~huge_page_mask(h)) return -EINVAL; - error = hugetlb_vmtruncate(inode, attr->ia_size); + /* protected by i_mutex */ + if ((newsize < oldsize && (info->seals & F_SEAL_SHRINK)) || + (newsize > oldsize && (info->seals & F_SEAL_GROW))) + return -EPERM; + error = hugetlb_vmtruncate(inode, newsize); if (error) return error; } @@ -712,6 +734,8 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode = new_inode(sb); if (inode) { + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); + inode->i_ino = get_next_ino(); inode_init_owner(inode, dir, mode); lockdep_set_class(&inode->i_mapping->i_mmap_rwsem, @@ -719,6 +743,7 @@ static struct inode *hugetlbfs_get_inode(struct super_block *sb, inode->i_mapping->a_ops = &hugetlbfs_aops; inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); inode->i_mapping->private_data = resv_map; + info->seals = F_SEAL_SEAL; switch (mode & S_IFMT) { default: init_special_inode(inode, mode, dev); diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index d02301e3f232..944e6e8bd572 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -273,6 +273,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) struct hugetlbfs_inode_info { struct shared_policy policy; struct inode 
vfs_inode; + unsigned int seals; }; static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) From 47b9012ecdc747f6936395265e677d41e11a31ff Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:29 -0800 Subject: [PATCH 087/118] shmem: add sealing support to hugetlb-backed memfd MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adapt add_seals()/get_seals() to work with hugetbfs-backed memory. Teach memfd_create() to allow sealing operations on MFD_HUGETLB. Link: http://lkml.kernel.org/r/20171107122800.25517-6-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 47 ++++++++++++++++++++++++++++------------------- 1 file changed, 28 insertions(+), 19 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 86d7e06ee855..1907688b75ee 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2717,6 +2717,19 @@ continue_resched: return error; } +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (file->f_op == &shmem_file_operations) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (file->f_op == &hugetlbfs_file_operations) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + #define F_ALL_SEALS (F_SEAL_SEAL | \ F_SEAL_SHRINK | \ F_SEAL_GROW | \ @@ -2725,7 +2738,7 @@ continue_resched: static int memfd_add_seals(struct file *file, unsigned int seals) { struct inode *inode = file_inode(file); - struct shmem_inode_info *info = SHMEM_I(inode); + unsigned int *file_seals; int error; /* @@ -2758,8 +2771,6 @@ static int memfd_add_seals(struct file *file, unsigned int seals) * other file types. */ - if (file->f_op != &shmem_file_operations) - return -EINVAL; if (!(file->f_mode & FMODE_WRITE)) return -EPERM; if (seals & ~(unsigned int)F_ALL_SEALS) @@ -2767,12 +2778,18 @@ static int memfd_add_seals(struct file *file, unsigned int seals) inode_lock(inode); - if (info->seals & F_SEAL_SEAL) { + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { error = -EPERM; goto unlock; } - if ((seals & F_SEAL_WRITE) && !(info->seals & F_SEAL_WRITE)) { + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { error = mapping_deny_writable(file->f_mapping); if (error) goto unlock; @@ -2784,7 +2801,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) } } - info->seals |= seals; + *file_seals |= seals; error = 0; unlock: @@ -2794,10 +2811,9 @@ unlock: static int memfd_get_seals(struct file *file) { - if (file->f_op != &shmem_file_operations) - return -EINVAL; + unsigned int *seals = memfd_file_seals_ptr(file); - return SHMEM_I(file_inode(file))->seals; + return seals ? *seals : -EINVAL; } long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) @@ -3655,7 +3671,7 @@ SYSCALL_DEFINE2(memfd_create, const char __user *, uname, unsigned int, flags) { - struct shmem_inode_info *info; + unsigned int *file_seals; struct file *file; int fd, error; char *name; @@ -3665,9 +3681,6 @@ SYSCALL_DEFINE2(memfd_create, if (flags & ~(unsigned int)MFD_ALL_FLAGS) return -EINVAL; } else { - /* Sealing not supported in hugetlbfs (MFD_HUGETLB) */ - if (flags & MFD_ALLOW_SEALING) - return -EINVAL; /* Allow huge page size encoding in flags. 
*/ if (flags & ~(unsigned int)(MFD_ALL_FLAGS | (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) @@ -3720,12 +3733,8 @@ SYSCALL_DEFINE2(memfd_create, file->f_flags |= O_RDWR | O_LARGEFILE; if (flags & MFD_ALLOW_SEALING) { - /* - * flags check at beginning of function ensures - * this is not a hugetlbfs (MFD_HUGETLB) file. - */ - info = SHMEM_I(file_inode(file)); - info->seals &= ~F_SEAL_SEAL; + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; } fd_install(fd, file); From 724978457fe29626bb6ce78e6c324c9d2648dcf7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:32 -0800 Subject: [PATCH 088/118] memfd-test: test hugetlbfs sealing MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Remove most of the special-casing of hugetlbfs now that sealing is supported. Link: http://lkml.kernel.org/r/20171107122800.25517-7-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memfd/memfd_test.c | 148 ++------------------- 1 file changed, 14 insertions(+), 134 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 132a54f74e88..59ca090e9752 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -513,6 +513,10 @@ static void mfd_assert_grow_write(int fd) static char *buf; ssize_t l; + /* hugetlbfs does not support write */ + if (hugetlbfs_test) + return; + buf = malloc(mfd_def_size * 8); if (!buf) { printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); @@ -533,6 +537,10 @@ static void mfd_fail_grow_write(int fd) static char *buf; ssize_t l; + /* hugetlbfs does not support write */ + if (hugetlbfs_test) + return; + buf = malloc(mfd_def_size * 8); if (!buf) { printf("malloc(%zu) failed: %m\n", mfd_def_size * 8); @@ -627,18 +635,13 @@ static void test_create(void) fd = mfd_assert_new("", 0, MFD_CLOEXEC); close(fd); - if (!hugetlbfs_test) { - /* verify MFD_ALLOW_SEALING is allowed */ - fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); - close(fd); + /* verify MFD_ALLOW_SEALING is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING); + close(fd); - /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ - fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); - close(fd); - } else { - /* sealing is not supported on hugetlbfs */ - mfd_fail_new("", MFD_ALLOW_SEALING); - } + /* verify MFD_ALLOW_SEALING | MFD_CLOEXEC is allowed */ + fd = mfd_assert_new("", 0, MFD_ALLOW_SEALING | MFD_CLOEXEC); + close(fd); } /* @@ -649,10 +652,6 @@ static void test_basic(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - printf("%s BASIC\n", MEMFD_STR); fd = mfd_assert_new("kern_memfd_basic", @@ -697,28 +696,6 @@ static void test_basic(void) close(fd); } -/* - * hugetlbfs doesn't support seals or write, so just verify grow and shrink - * on a hugetlbfs file created via memfd_create. - */ -static void test_hugetlbfs_grow_shrink(void) -{ - int fd; - - printf("%s HUGETLBFS-GROW-SHRINK\n", MEMFD_STR); - - fd = mfd_assert_new("kern_memfd_seal_write", - mfd_def_size, - MFD_CLOEXEC); - - mfd_assert_read(fd); - mfd_assert_write(fd); - mfd_assert_shrink(fd); - mfd_assert_grow(fd); - - close(fd); -} - /* * Test SEAL_WRITE * Test whether SEAL_WRITE actually prevents modifications. 
@@ -727,13 +704,6 @@ static void test_seal_write(void) { int fd; - /* - * hugetlbfs does not contain sealing or write support. Just test - * basic grow and shrink via test_hugetlbfs_grow_shrink. - */ - if (hugetlbfs_test) - return test_hugetlbfs_grow_shrink(); - printf("%s SEAL-WRITE\n", MEMFD_STR); fd = mfd_assert_new("kern_memfd_seal_write", @@ -760,10 +730,6 @@ static void test_seal_shrink(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - printf("%s SEAL-SHRINK\n", MEMFD_STR); fd = mfd_assert_new("kern_memfd_seal_shrink", @@ -790,10 +756,6 @@ static void test_seal_grow(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - printf("%s SEAL-GROW\n", MEMFD_STR); fd = mfd_assert_new("kern_memfd_seal_grow", @@ -820,10 +782,6 @@ static void test_seal_resize(void) { int fd; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - printf("%s SEAL-RESIZE\n", MEMFD_STR); fd = mfd_assert_new("kern_memfd_seal_resize", @@ -842,32 +800,6 @@ static void test_seal_resize(void) close(fd); } -/* - * hugetlbfs does not support seals. Basic test to dup the memfd created - * fd and perform some basic operations on it. - */ -static void hugetlbfs_dup(char *b_suffix) -{ - int fd, fd2; - - printf("%s HUGETLBFS-DUP %s\n", MEMFD_STR, b_suffix); - - fd = mfd_assert_new("kern_memfd_share_dup", - mfd_def_size, - MFD_CLOEXEC); - - fd2 = mfd_assert_dup(fd); - - mfd_assert_read(fd); - mfd_assert_write(fd); - - mfd_assert_shrink(fd2); - mfd_assert_grow(fd2); - - close(fd2); - close(fd); -} - /* * Test sharing via dup() * Test that seals are shared between dupped FDs and they're all equal. @@ -876,15 +808,6 @@ static void test_share_dup(char *banner, char *b_suffix) { int fd, fd2; - /* - * hugetlbfs does not contain sealing support. Perform some - * basic testing on dup'ed fd instead via hugetlbfs_dup. - */ - if (hugetlbfs_test) { - hugetlbfs_dup(b_suffix); - return; - } - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_dup", @@ -927,10 +850,6 @@ static void test_share_mmap(char *banner, char *b_suffix) int fd; void *p; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_mmap", @@ -955,32 +874,6 @@ static void test_share_mmap(char *banner, char *b_suffix) close(fd); } -/* - * Basic test to make sure we can open the hugetlbfs fd via /proc and - * perform some simple operations on it. - */ -static void hugetlbfs_proc_open(char *b_suffix) -{ - int fd, fd2; - - printf("%s HUGETLBFS-PROC-OPEN %s\n", MEMFD_STR, b_suffix); - - fd = mfd_assert_new("kern_memfd_share_open", - mfd_def_size, - MFD_CLOEXEC); - - fd2 = mfd_assert_open(fd, O_RDWR, 0); - - mfd_assert_read(fd); - mfd_assert_write(fd); - - mfd_assert_shrink(fd2); - mfd_assert_grow(fd2); - - close(fd2); - close(fd); -} - /* * Test sealing with open(/proc/self/fd/%d) * Via /proc we can get access to a separate file-context for the same memfd. @@ -991,15 +884,6 @@ static void test_share_open(char *banner, char *b_suffix) { int fd, fd2; - /* - * hugetlbfs does not contain sealing support. 
So test basic - * functionality of using /proc fd via hugetlbfs_proc_open - */ - if (hugetlbfs_test) { - hugetlbfs_proc_open(b_suffix); - return; - } - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_open", @@ -1043,10 +927,6 @@ static void test_share_fork(char *banner, char *b_suffix) int fd; pid_t pid; - /* hugetlbfs does not contain sealing support */ - if (hugetlbfs_test) - return; - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_fork", From 3037aeb99134b0907fe0901055570a329b1f583d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:36 -0800 Subject: [PATCH 089/118] memfd-test: add 'memfd-hugetlb:' prefix when testing hugetlbfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: http://lkml.kernel.org/r/20171107122800.25517-8-marcandre.lureau@redhat.com Suggested-by: Mike Kravetz Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memfd/memfd_test.c | 26 +++++++++++++--------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 59ca090e9752..910c55f858bb 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -20,6 +20,7 @@ #include #define MEMFD_STR "memfd:" +#define MEMFD_HUGE_STR "memfd-hugetlb:" #define SHARED_FT_STR "(shared file-table)" #define MFD_DEF_SIZE 8192 @@ -30,6 +31,7 @@ */ static int hugetlbfs_test; static size_t mfd_def_size = MFD_DEF_SIZE; +static const char *memfd_str = MEMFD_STR; /* * Copied from mlock2-tests.c @@ -606,7 +608,7 @@ static void test_create(void) char buf[2048]; int fd; - printf("%s CREATE\n", MEMFD_STR); + printf("%s CREATE\n", memfd_str); /* test NULL name */ mfd_fail_new(NULL, 0); @@ -652,7 +654,7 @@ static void test_basic(void) { int fd; - printf("%s BASIC\n", MEMFD_STR); + printf("%s BASIC\n", memfd_str); fd = mfd_assert_new("kern_memfd_basic", mfd_def_size, @@ -704,7 +706,7 @@ static void test_seal_write(void) { int fd; - printf("%s SEAL-WRITE\n", MEMFD_STR); + printf("%s SEAL-WRITE\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_write", mfd_def_size, @@ -730,7 +732,7 @@ static void test_seal_shrink(void) { int fd; - printf("%s SEAL-SHRINK\n", MEMFD_STR); + printf("%s SEAL-SHRINK\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_shrink", mfd_def_size, @@ -756,7 +758,7 @@ static void test_seal_grow(void) { int fd; - printf("%s SEAL-GROW\n", MEMFD_STR); + printf("%s SEAL-GROW\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_grow", mfd_def_size, @@ -782,7 +784,7 @@ static void test_seal_resize(void) { int fd; - printf("%s SEAL-RESIZE\n", MEMFD_STR); + printf("%s SEAL-RESIZE\n", memfd_str); fd = mfd_assert_new("kern_memfd_seal_resize", mfd_def_size, @@ -808,7 +810,7 @@ static void test_share_dup(char *banner, char *b_suffix) { int fd, fd2; - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_dup", mfd_def_size, @@ -850,7 +852,7 @@ static void test_share_mmap(char *banner, char *b_suffix) int fd; void *p; - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_mmap", 
mfd_def_size, @@ -884,7 +886,7 @@ static void test_share_open(char *banner, char *b_suffix) { int fd, fd2; - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_open", mfd_def_size, @@ -927,7 +929,7 @@ static void test_share_fork(char *banner, char *b_suffix) int fd; pid_t pid; - printf("%s %s %s\n", MEMFD_STR, banner, b_suffix); + printf("%s %s %s\n", memfd_str, banner, b_suffix); fd = mfd_assert_new("kern_memfd_share_fork", mfd_def_size, @@ -963,7 +965,11 @@ int main(int argc, char **argv) } hugetlbfs_test = 1; + memfd_str = MEMFD_HUGE_STR; mfd_def_size = hpage_size * 2; + } else { + printf("Unknown option: %s\n", argv[1]); + abort(); } } From 29f34d1dd6657a0c7da875deb57775c67ff6bd86 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:40 -0800 Subject: [PATCH 090/118] memfd-test: move common code to a shared unit MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The memfd & fuse tests will share more common code in the following commits to test hugetlb support. Link: http://lkml.kernel.org/r/20171107122800.25517-9-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memfd/Makefile | 5 +++ tools/testing/selftests/memfd/common.c | 46 ++++++++++++++++++++++ tools/testing/selftests/memfd/common.h | 9 +++++ tools/testing/selftests/memfd/fuse_test.c | 8 +--- tools/testing/selftests/memfd/memfd_test.c | 36 +---------------- 5 files changed, 64 insertions(+), 40 deletions(-) create mode 100644 tools/testing/selftests/memfd/common.c create mode 100644 tools/testing/selftests/memfd/common.h diff --git a/tools/testing/selftests/memfd/Makefile b/tools/testing/selftests/memfd/Makefile index 3926a0409dda..a5276a91dfbf 100644 --- a/tools/testing/selftests/memfd/Makefile +++ b/tools/testing/selftests/memfd/Makefile @@ -12,3 +12,8 @@ fuse_mnt.o: CFLAGS += $(shell pkg-config fuse --cflags) include ../lib.mk $(OUTPUT)/fuse_mnt: LDLIBS += $(shell pkg-config fuse --libs) + +$(OUTPUT)/memfd_test: memfd_test.c common.o +$(OUTPUT)/fuse_test: fuse_test.c common.o + +EXTRA_CLEAN = common.o diff --git a/tools/testing/selftests/memfd/common.c b/tools/testing/selftests/memfd/common.c new file mode 100644 index 000000000000..8eb3d75f6e60 --- /dev/null +++ b/tools/testing/selftests/memfd/common.c @@ -0,0 +1,46 @@ +// SPDX-License-Identifier: GPL-2.0 +#define _GNU_SOURCE +#define __EXPORTED_HEADERS__ + +#include +#include +#include +#include +#include +#include + +#include "common.h" + +int hugetlbfs_test = 0; + +/* + * Copied from mlock2-tests.c + */ +unsigned long default_huge_page_size(void) +{ + unsigned long hps = 0; + char *line = NULL; + size_t linelen = 0; + FILE *f = fopen("/proc/meminfo", "r"); + + if (!f) + return 0; + while (getline(&line, &linelen, f) > 0) { + if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { + hps <<= 10; + break; + } + } + + free(line); + fclose(f); + return hps; +} + +int sys_memfd_create(const char *name, unsigned int flags) +{ + if (hugetlbfs_test) + flags |= MFD_HUGETLB; + + return syscall(__NR_memfd_create, name, flags); +} diff --git a/tools/testing/selftests/memfd/common.h b/tools/testing/selftests/memfd/common.h new file mode 100644 index 000000000000..522d2c630bd8 --- /dev/null +++ 
b/tools/testing/selftests/memfd/common.h @@ -0,0 +1,9 @@ +#ifndef COMMON_H_ +#define COMMON_H_ + +extern int hugetlbfs_test; + +unsigned long default_huge_page_size(void); +int sys_memfd_create(const char *name, unsigned int flags); + +#endif diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 1ccb7a3eb14b..795a25ba8521 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -33,15 +33,11 @@ #include #include +#include "common.h" + #define MFD_DEF_SIZE 8192 #define STACK_SIZE 65536 -static int sys_memfd_create(const char *name, - unsigned int flags) -{ - return syscall(__NR_memfd_create, name, flags); -} - static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { int r, fd; diff --git a/tools/testing/selftests/memfd/memfd_test.c b/tools/testing/selftests/memfd/memfd_test.c index 910c55f858bb..10baa1652fc2 100644 --- a/tools/testing/selftests/memfd/memfd_test.c +++ b/tools/testing/selftests/memfd/memfd_test.c @@ -19,6 +19,8 @@ #include #include +#include "common.h" + #define MEMFD_STR "memfd:" #define MEMFD_HUGE_STR "memfd-hugetlb:" #define SHARED_FT_STR "(shared file-table)" @@ -29,43 +31,9 @@ /* * Default is not to test hugetlbfs */ -static int hugetlbfs_test; static size_t mfd_def_size = MFD_DEF_SIZE; static const char *memfd_str = MEMFD_STR; -/* - * Copied from mlock2-tests.c - */ -static unsigned long default_huge_page_size(void) -{ - unsigned long hps = 0; - char *line = NULL; - size_t linelen = 0; - FILE *f = fopen("/proc/meminfo", "r"); - - if (!f) - return 0; - while (getline(&line, &linelen, f) > 0) { - if (sscanf(line, "Hugepagesize: %lu kB", &hps) == 1) { - hps <<= 10; - break; - } - } - - free(line); - fclose(f); - return hps; -} - -static int sys_memfd_create(const char *name, - unsigned int flags) -{ - if (hugetlbfs_test) - flags |= MFD_HUGETLB; - - return syscall(__NR_memfd_create, name, flags); -} - static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { int r, fd; From c5c63835e5713b09fc974241db47956362a63efa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marc-Andr=C3=A9=20Lureau?= Date: Wed, 31 Jan 2018 16:19:44 -0800 Subject: [PATCH 091/118] memfd-test: run fuse test on hugetlb backend memory MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Link: http://lkml.kernel.org/r/20171107122800.25517-10-marcandre.lureau@redhat.com Signed-off-by: Marc-André Lureau Suggested-by: Mike Kravetz Reviewed-by: Mike Kravetz Cc: Andrea Arcangeli Cc: Hugh Dickins Cc: Michal Hocko Cc: David Herrmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/memfd/fuse_test.c | 38 +++++++++++++++---- .../testing/selftests/memfd/run_fuse_test.sh | 2 +- tools/testing/selftests/memfd/run_tests.sh | 1 + 3 files changed, 32 insertions(+), 9 deletions(-) diff --git a/tools/testing/selftests/memfd/fuse_test.c b/tools/testing/selftests/memfd/fuse_test.c index 795a25ba8521..b018e835737d 100644 --- a/tools/testing/selftests/memfd/fuse_test.c +++ b/tools/testing/selftests/memfd/fuse_test.c @@ -38,6 +38,8 @@ #define MFD_DEF_SIZE 8192 #define STACK_SIZE 65536 +static size_t mfd_def_size = MFD_DEF_SIZE; + static int mfd_assert_new(const char *name, loff_t sz, unsigned int flags) { int r, fd; @@ -123,7 +125,7 @@ static void *mfd_assert_mmap_shared(int fd) void *p; p = mmap(NULL, - MFD_DEF_SIZE, + mfd_def_size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, @@ -141,7 +143,7 @@ static void 
*mfd_assert_mmap_private(int fd) void *p; p = mmap(NULL, - MFD_DEF_SIZE, + mfd_def_size, PROT_READ | PROT_WRITE, MAP_PRIVATE, fd, @@ -174,7 +176,7 @@ static int sealing_thread_fn(void *arg) usleep(200000); /* unmount mapping before sealing to avoid i_mmap_writable failures */ - munmap(global_p, MFD_DEF_SIZE); + munmap(global_p, mfd_def_size); /* Try sealing the global file; expect EBUSY or success. Current * kernels will never succeed, but in the future, kernels might @@ -224,7 +226,7 @@ static void join_sealing_thread(pid_t pid) int main(int argc, char **argv) { - static const char zero[MFD_DEF_SIZE]; + char *zero; int fd, mfd, r; void *p; int was_sealed; @@ -235,6 +237,25 @@ int main(int argc, char **argv) abort(); } + if (argc >= 3) { + if (!strcmp(argv[2], "hugetlbfs")) { + unsigned long hpage_size = default_huge_page_size(); + + if (!hpage_size) { + printf("Unable to determine huge page size\n"); + abort(); + } + + hugetlbfs_test = 1; + mfd_def_size = hpage_size * 2; + } else { + printf("Unknown option: %s\n", argv[2]); + abort(); + } + } + + zero = calloc(sizeof(*zero), mfd_def_size); + /* open FUSE memfd file for GUP testing */ printf("opening: %s\n", argv[1]); fd = open(argv[1], O_RDONLY | O_CLOEXEC); @@ -245,7 +266,7 @@ int main(int argc, char **argv) /* create new memfd-object */ mfd = mfd_assert_new("kern_memfd_fuse", - MFD_DEF_SIZE, + mfd_def_size, MFD_CLOEXEC | MFD_ALLOW_SEALING); /* mmap memfd-object for writing */ @@ -264,7 +285,7 @@ int main(int argc, char **argv) * This guarantees that the receive-buffer is pinned for 1s until the * data is written into it. The racing ADD_SEALS should thus fail as * the pages are still pinned. */ - r = read(fd, p, MFD_DEF_SIZE); + r = read(fd, p, mfd_def_size); if (r < 0) { printf("read() failed: %m\n"); abort(); @@ -291,10 +312,10 @@ int main(int argc, char **argv) * enough to avoid any in-flight writes. */ p = mfd_assert_mmap_private(mfd); - if (was_sealed && memcmp(p, zero, MFD_DEF_SIZE)) { + if (was_sealed && memcmp(p, zero, mfd_def_size)) { printf("memfd sealed during read() but data not discarded\n"); abort(); - } else if (!was_sealed && !memcmp(p, zero, MFD_DEF_SIZE)) { + } else if (!was_sealed && !memcmp(p, zero, mfd_def_size)) { printf("memfd sealed after read() but data discarded\n"); abort(); } @@ -303,6 +324,7 @@ int main(int argc, char **argv) close(fd); printf("fuse: DONE\n"); + free(zero); return 0; } diff --git a/tools/testing/selftests/memfd/run_fuse_test.sh b/tools/testing/selftests/memfd/run_fuse_test.sh index 407df68dfe27..22e572e2d66a 100755 --- a/tools/testing/selftests/memfd/run_fuse_test.sh +++ b/tools/testing/selftests/memfd/run_fuse_test.sh @@ -10,6 +10,6 @@ set -e mkdir mnt ./fuse_mnt ./mnt -./fuse_test ./mnt/memfd +./fuse_test ./mnt/memfd $@ fusermount -u ./mnt rmdir ./mnt diff --git a/tools/testing/selftests/memfd/run_tests.sh b/tools/testing/selftests/memfd/run_tests.sh index daabb350697c..c2d41ed81b24 100755 --- a/tools/testing/selftests/memfd/run_tests.sh +++ b/tools/testing/selftests/memfd/run_tests.sh @@ -60,6 +60,7 @@ fi # Run the hugetlbfs test # ./memfd_test hugetlbfs +./run_fuse_test.sh hugetlbfs # # Give back any huge pages allocated for the test From 284cd241a18ee6d999296f8ff3104eb6d2fc898f Mon Sep 17 00:00:00 2001 From: Eric Biggers Date: Wed, 31 Jan 2018 16:19:48 -0800 Subject: [PATCH 092/118] userfaultfd: convert to use anon_inode_getfd() Nothing actually calls userfaultfd_file_create() besides the userfaultfd() system call itself. 
So simplify things by folding it into the system call and using anon_inode_getfd() instead of anon_inode_getfile(). Do the same in resolve_userfault_fork() as well. This removes over 50 lines with no change in functionality. Link: http://lkml.kernel.org/r/20171229212403.22800-1-ebiggers3@gmail.com Signed-off-by: Eric Biggers Reviewed-by: Mike Rapoport Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/userfaultfd.c | 70 +++++++----------------------------------------- 1 file changed, 9 insertions(+), 61 deletions(-) diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index a9d0ddc12ace..87a13a7c8270 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -988,24 +988,14 @@ static int resolve_userfault_fork(struct userfaultfd_ctx *ctx, struct uffd_msg *msg) { int fd; - struct file *file; - unsigned int flags = new->flags & UFFD_SHARED_FCNTL_FLAGS; - fd = get_unused_fd_flags(flags); + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new, + O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS)); if (fd < 0) return fd; - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, new, - O_RDWR | flags); - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - - fd_install(fd, file); msg->arg.reserved.reserved1 = 0; msg->arg.fork.ufd = fd; - return 0; } @@ -1887,24 +1877,10 @@ static void init_once_userfaultfd_ctx(void *mem) seqcount_init(&ctx->refile_seq); } -/** - * userfaultfd_file_create - Creates a userfaultfd file pointer. - * @flags: Flags for the userfaultfd file. - * - * This function creates a userfaultfd file pointer, w/out installing - * it into the fd table. This is useful when the userfaultfd file is - * used during the initialization of data structures that require - * extra setup after the userfaultfd creation. So the userfaultfd - * creation is split into the file pointer creation phase, and the - * file descriptor installation phase. In this way races with - * userspace closing the newly installed file descriptor can be - * avoided. Returns a userfaultfd file pointer, or a proper error - * pointer. 
- */ -static struct file *userfaultfd_file_create(int flags) +SYSCALL_DEFINE1(userfaultfd, int, flags) { - struct file *file; struct userfaultfd_ctx *ctx; + int fd; BUG_ON(!current->mm); @@ -1912,14 +1888,12 @@ static struct file *userfaultfd_file_create(int flags) BUILD_BUG_ON(UFFD_CLOEXEC != O_CLOEXEC); BUILD_BUG_ON(UFFD_NONBLOCK != O_NONBLOCK); - file = ERR_PTR(-EINVAL); if (flags & ~UFFD_SHARED_FCNTL_FLAGS) - goto out; + return -EINVAL; - file = ERR_PTR(-ENOMEM); ctx = kmem_cache_alloc(userfaultfd_ctx_cachep, GFP_KERNEL); if (!ctx) - goto out; + return -ENOMEM; atomic_set(&ctx->refcount, 1); ctx->flags = flags; @@ -1930,39 +1904,13 @@ static struct file *userfaultfd_file_create(int flags) /* prevent the mm struct to be freed */ mmgrab(ctx->mm); - file = anon_inode_getfile("[userfaultfd]", &userfaultfd_fops, ctx, - O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); - if (IS_ERR(file)) { + fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx, + O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS)); + if (fd < 0) { mmdrop(ctx->mm); kmem_cache_free(userfaultfd_ctx_cachep, ctx); } -out: - return file; -} - -SYSCALL_DEFINE1(userfaultfd, int, flags) -{ - int fd, error; - struct file *file; - - error = get_unused_fd_flags(flags & UFFD_SHARED_FCNTL_FLAGS); - if (error < 0) - return error; - fd = error; - - file = userfaultfd_file_create(flags); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_put_unused_fd; - } - fd_install(fd, file); - return fd; - -err_put_unused_fd: - put_unused_fd(fd); - - return error; } static int __init userfaultfd_init(void) From 69d763fc6d3aee787a3e8c8c35092b4f4960fa5d Mon Sep 17 00:00:00 2001 From: Mel Gorman Date: Wed, 31 Jan 2018 16:19:52 -0800 Subject: [PATCH 093/118] mm: pin address_space before dereferencing it while isolating an LRU page Minchan Kim asked the following question -- what locks protects address_space destroying when race happens between inode trauncation and __isolate_lru_page? Jan Kara clarified by describing the race as follows CPU1 CPU2 truncate(inode) __isolate_lru_page() ... truncate_inode_page(mapping, page); delete_from_page_cache(page) spin_lock_irqsave(&mapping->tree_lock, flags); __delete_from_page_cache(page, NULL) page_cache_tree_delete(..) ... mapping = page_mapping(page); page->mapping = NULL; ... spin_unlock_irqrestore(&mapping->tree_lock, flags); page_cache_free_page(mapping, page) put_page(page) if (put_page_testzero(page)) -> false - inode now has no pages and can be freed including embedded address_space if (mapping && !mapping->a_ops->migratepage) - we've dereferenced mapping which is potentially already free. The race is theoretically possible but unlikely. Before the delete_from_page_cache, truncate_cleanup_page is called so the page is likely to be !PageDirty or PageWriteback which gets skipped by the only caller that checks the mappping in __isolate_lru_page. Even if the race occurs, a substantial amount of work has to happen during a tiny window with no preemption but it could potentially be done using a virtual machine to artifically slow one CPU or halt it during the critical window. This patch should eliminate the race with truncation by try-locking the page before derefencing mapping and aborting if the lock was not acquired. There was a suggestion from Huang Ying to use RCU as a side-effect to prevent mapping being freed. However, I do not like the solution as it's an unconventional means of preserving a mapping and it's not a context where rcu_read_lock is obviously protecting rcu data. 
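Put differently, the page lock is what stabilises page->mapping, because truncation clears the mapping while holding that lock. A condensed sketch of the pattern, mirroring the __isolate_lru_page() hunk below (the helper name is invented for illustration):

/* sketch of the fix below: dereference the mapping only under the page lock */
static bool dirty_page_is_migratable(struct page *page)
{
        struct address_space *mapping;
        bool migrate_dirty;

        if (!trylock_page(page))
                return false;   /* contended; caller skips isolation */

        mapping = page_mapping(page);
        migrate_dirty = mapping && mapping->a_ops->migratepage;
        unlock_page(page);

        return migrate_dirty;
}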
Link: http://lkml.kernel.org/r/20180104102512.2qos3h5vqzeisrek@techsingularity.net Fixes: c82449352854 ("mm: compaction: make isolate_lru_page() filter-aware again") Signed-off-by: Mel Gorman Acked-by: Minchan Kim Cc: "Huang, Ying" Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmscan.c | 14 ++++++++++++-- 1 file changed, 12 insertions(+), 2 deletions(-) diff --git a/mm/vmscan.c b/mm/vmscan.c index 1a33c8e1e758..fdd3fc6be862 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -1415,14 +1415,24 @@ int __isolate_lru_page(struct page *page, isolate_mode_t mode) if (PageDirty(page)) { struct address_space *mapping; + bool migrate_dirty; /* * Only pages without mappings or that have a * ->migratepage callback are possible to migrate - * without blocking + * without blocking. However, we can be racing with + * truncation so it's necessary to lock the page + * to stabilise the mapping as truncation holds + * the page lock until after the page is removed + * from the page cache. */ + if (!trylock_page(page)) + return ret; + mapping = page_mapping(page); - if (mapping && !mapping->a_ops->migratepage) + migrate_dirty = mapping && mapping->a_ops->migratepage; + unlock_page(page); + if (!migrate_dirty) return ret; } } From a7ab400d6fe73d0119fdc234e9982a6f80faea9f Mon Sep 17 00:00:00 2001 From: "shidao.ytt" Date: Wed, 31 Jan 2018 16:19:55 -0800 Subject: [PATCH 094/118] mm/fadvise: discard partial page if endbyte is also EOF During our recent testing with fadvise(FADV_DONTNEED), we find that if given offset/length is not page-aligned, the last page will not be discarded. The tool we use is vmtouch (https://hoytech.com/vmtouch/), we map a 10KB-sized file into memory and then try to run this tool to evict the whole file mapping, but the last single page always remains staying in the memory: $./vmtouch -e test_10K Files: 1 Directories: 0 Evicted Pages: 3 (12K) Elapsed: 2.1e-05 seconds $./vmtouch test_10K Files: 1 Directories: 0 Resident Pages: 1/3 4K/12K 33.3% Elapsed: 5.5e-05 seconds However when we test with an older kernel, say 3.10, this problem is gone. So we wonder if this is a regression: $./vmtouch -e test_10K Files: 1 Directories: 0 Evicted Pages: 3 (12K) Elapsed: 8.2e-05 seconds $./vmtouch test_10K Files: 1 Directories: 0 Resident Pages: 0/3 0/12K 0% <-- partial page also discarded Elapsed: 5e-05 seconds After digging a little bit into this problem, we find it seems not a regression. Not discarding partial page is likely to be on purpose according to commit 441c228f817f ("mm: fadvise: document the fadvise(FADV_DONTNEED) behaviour for partial pages") written by Mel Gorman. He explained why partial pages should be preserved instead of being discarded when using fadvise(FADV_DONTNEED). However, the interesting part is that the actual code did NOT work as the same as it was described, the partial page was still discarded anyway, due to a calculation mistake of `end_index' passed to invalidate_mapping_pages(). This mistake has not been fixed until recently, that's why we fail to reproduce our problem in old kernels. The fix is done in commit 18aba41cbf ("mm/fadvise.c: do not discard partial pages with POSIX_FADV_DONTNEED") by Oleg Drokin. Back to the original testing, our problem becomes that there is a special case that, if the page-unaligned `endbyte' is also the end of file, it is not necessary at all to preserve the last partial page, as we all know no one else will use the rest of it. It should be safe enough if we just discard the whole page. 
So we add an EOF check in this patch. We also find a poosbile real world issue in mainline kernel. Assume such scenario: A userspace backup application want to backup a huge amount of small files (<4k) at once, the developer might (I guess) want to use fadvise(FADV_DONTNEED) to save memory. However, FADV_DONTNEED won't really happen since the only page mapped is a partial page, and kernel will preserve it. Our patch also fixes this problem, since we know the endbyte is EOF, so we discard it. Here is a simple reproducer to reproduce and verify each scenario we described above: test_fadvise.c ============================== #include #include #include #include #include #include #include int main(int argc, char **argv) { int i, fd, ret, len; struct stat buf; void *addr; unsigned char *vec; char *strbuf; ssize_t pagesize = getpagesize(); ssize_t filesize; fd = open(argv[1], O_RDWR|O_CREAT, S_IRUSR|S_IWUSR); if (fd < 0) return -1; filesize = strtoul(argv[2], NULL, 10); strbuf = malloc(filesize); memset(strbuf, 42, filesize); write(fd, strbuf, filesize); free(strbuf); fsync(fd); len = (filesize + pagesize - 1) / pagesize; printf("length of pages: %d\n", len); addr = mmap(NULL, filesize, PROT_READ, MAP_SHARED, fd, 0); if (addr == MAP_FAILED) return -1; ret = posix_fadvise(fd, 0, filesize, POSIX_FADV_DONTNEED); if (ret < 0) return -1; vec = malloc(len); ret = mincore(addr, filesize, (void *)vec); if (ret < 0) return -1; for (i = 0; i < len; i++) printf("pages[%d]: %x\n", i, vec[i] & 0x1); free(vec); close(fd); return 0; } ============================== Test 1: running on kernel with commit 18aba41cbf reverted: [root@caspar ~]# uname -r 4.15.0-rc6.revert+ [root@caspar ~]# ./test_fadvise file1 1024 length of pages: 1 pages[0]: 0 # <-- partial page discarded [root@caspar ~]# ./test_fadvise file2 8192 length of pages: 2 pages[0]: 0 pages[1]: 0 [root@caspar ~]# ./test_fadvise file3 10240 length of pages: 3 pages[0]: 0 pages[1]: 0 pages[2]: 0 # <-- partial page discarded Test 2: running on mainline kernel: [root@caspar ~]# uname -r 4.15.0-rc6+ [root@caspar ~]# ./test_fadvise test1 1024 length of pages: 1 pages[0]: 1 # <-- partial and the only page not discarded [root@caspar ~]# ./test_fadvise test2 8192 length of pages: 2 pages[0]: 0 pages[1]: 0 [root@caspar ~]# ./test_fadvise test3 10240 length of pages: 3 pages[0]: 0 pages[1]: 0 pages[2]: 1 # <-- partial page not discarded Test 3: running on kernel with this patch: [root@caspar ~]# uname -r 4.15.0-rc6.patched+ [root@caspar ~]# ./test_fadvise test1 1024 length of pages: 1 pages[0]: 0 # <-- partial page and EOF, discarded [root@caspar ~]# ./test_fadvise test2 8192 length of pages: 2 pages[0]: 0 pages[1]: 0 [root@caspar ~]# ./test_fadvise test3 10240 length of pages: 3 pages[0]: 0 pages[1]: 0 pages[2]: 0 # <-- partial page and EOF, discarded [akpm@linux-foundation.org: tweak code comment] Link: http://lkml.kernel.org/r/5222da9ee20e1695eaabb69f631f200d6e6b8876.1515132470.git.jinli.zjl@alibaba-inc.com Signed-off-by: shidao.ytt Signed-off-by: Caspar Zhang Reviewed-by: Oliver Yang Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/fadvise.c | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/mm/fadvise.c b/mm/fadvise.c index ec70d6e4b86d..767887f5f3bf 100644 --- a/mm/fadvise.c +++ b/mm/fadvise.c @@ -127,7 +127,15 @@ SYSCALL_DEFINE4(fadvise64_64, int, fd, loff_t, offset, loff_t, len, int, advice) */ start_index = (offset+(PAGE_SIZE-1)) >> PAGE_SHIFT; end_index = (endbyte >> PAGE_SHIFT); - if ((endbyte & 
~PAGE_MASK) != ~PAGE_MASK) { + /* + * The page at end_index will be inclusively discarded according + * by invalidate_mapping_pages(), so subtracting 1 from + * end_index means we will skip the last page. But if endbyte + * is page aligned or is at the end of file, we should not skip + * that page - discarding the last page is safe enough. + */ + if ((endbyte & ~PAGE_MASK) != ~PAGE_MASK && + endbyte != inode->i_size - 1) { /* First page is tricky as 0 - 1 = -1, but pgoff_t * is unsigned, so the end_index >= start_index * check below would be true and we'll discard the whole From 9c3760eb80880f3e02546e0a2ef479e1454986b3 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 31 Jan 2018 16:19:59 -0800 Subject: [PATCH 095/118] zswap: only save zswap header when necessary We waste sizeof(swp_entry_t) for zswap header when using zsmalloc as zpool driver because zsmalloc doesn't support eviction. Add zpool_evictable() to detect if zpool is potentially evictable, and use it in zswap to avoid waste memory for zswap header. [yuzhao@google.com: The zpool->" prefix is a result of copy & paste] Link: http://lkml.kernel.org/r/20180110225626.110330-1-yuzhao@google.com Link: http://lkml.kernel.org/r/20180110224741.83751-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Dan Streetman Reviewed-by: Sergey Senozhatsky Cc: Seth Jennings Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/zpool.h | 2 ++ mm/zpool.c | 25 +++++++++++++++++++++++-- mm/zsmalloc.c | 7 ------- mm/zswap.c | 20 ++++++++++---------- 4 files changed, 35 insertions(+), 19 deletions(-) diff --git a/include/linux/zpool.h b/include/linux/zpool.h index 004ba807df96..7238865e75b0 100644 --- a/include/linux/zpool.h +++ b/include/linux/zpool.h @@ -108,4 +108,6 @@ void zpool_register_driver(struct zpool_driver *driver); int zpool_unregister_driver(struct zpool_driver *driver); +bool zpool_evictable(struct zpool *pool); + #endif diff --git a/mm/zpool.c b/mm/zpool.c index fd3ff719c32c..e1e7aa6d1d06 100644 --- a/mm/zpool.c +++ b/mm/zpool.c @@ -21,6 +21,7 @@ struct zpool { struct zpool_driver *driver; void *pool; const struct zpool_ops *ops; + bool evictable; struct list_head list; }; @@ -142,7 +143,7 @@ EXPORT_SYMBOL(zpool_has_pool); * * This creates a new zpool of the specified type. The gfp flags will be * used when allocating memory, if the implementation supports it. If the - * ops param is NULL, then the created zpool will not be shrinkable. + * ops param is NULL, then the created zpool will not be evictable. * * Implementations must guarantee this to be thread-safe. * @@ -180,6 +181,7 @@ struct zpool *zpool_create_pool(const char *type, const char *name, gfp_t gfp, zpool->driver = driver; zpool->pool = driver->create(name, gfp, ops, zpool); zpool->ops = ops; + zpool->evictable = driver->shrink && ops && ops->evict; if (!zpool->pool) { pr_err("couldn't create %s pool\n", type); @@ -296,7 +298,8 @@ void zpool_free(struct zpool *zpool, unsigned long handle) int zpool_shrink(struct zpool *zpool, unsigned int pages, unsigned int *reclaimed) { - return zpool->driver->shrink(zpool->pool, pages, reclaimed); + return zpool->driver->shrink ? 
+ zpool->driver->shrink(zpool->pool, pages, reclaimed) : -EINVAL; } /** @@ -355,6 +358,24 @@ u64 zpool_get_total_size(struct zpool *zpool) return zpool->driver->total_size(zpool->pool); } +/** + * zpool_evictable() - Test if zpool is potentially evictable + * @pool The zpool to test + * + * Zpool is only potentially evictable when it's created with struct + * zpool_ops.evict and its driver implements struct zpool_driver.shrink. + * + * However, it doesn't necessarily mean driver will use zpool_ops.evict + * in its implementation of zpool_driver.shrink. It could do internal + * defragmentation instead. + * + * Returns: true if potentially evictable; false otherwise. + */ +bool zpool_evictable(struct zpool *zpool) +{ + return zpool->evictable; +} + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Dan Streetman "); MODULE_DESCRIPTION("Common API for compressed memory storage"); diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index e136a8e72c48..f797d8b0d820 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -404,12 +404,6 @@ static void zs_zpool_free(void *pool, unsigned long handle) zs_free(pool, handle); } -static int zs_zpool_shrink(void *pool, unsigned int pages, - unsigned int *reclaimed) -{ - return -EINVAL; -} - static void *zs_zpool_map(void *pool, unsigned long handle, enum zpool_mapmode mm) { @@ -447,7 +441,6 @@ static struct zpool_driver zs_zpool_driver = { .destroy = zs_zpool_destroy, .malloc = zs_zpool_malloc, .free = zs_zpool_free, - .shrink = zs_zpool_shrink, .map = zs_zpool_map, .unmap = zs_zpool_unmap, .total_size = zs_zpool_total_size, diff --git a/mm/zswap.c b/mm/zswap.c index 1133b4ceb72e..c004aa4fd3f4 100644 --- a/mm/zswap.c +++ b/mm/zswap.c @@ -1001,11 +1001,11 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, struct zswap_entry *entry, *dupentry; struct crypto_comp *tfm; int ret; - unsigned int dlen = PAGE_SIZE, len; + unsigned int hlen, dlen = PAGE_SIZE; unsigned long handle, value; char *buf; u8 *src, *dst; - struct zswap_header *zhdr; + struct zswap_header zhdr = { .swpentry = swp_entry(type, offset) }; if (!zswap_enabled || !tree) { ret = -ENODEV; @@ -1063,8 +1063,8 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, } /* store */ - len = dlen + sizeof(struct zswap_header); - ret = zpool_malloc(entry->pool->zpool, len, + hlen = zpool_evictable(entry->pool->zpool) ? 
sizeof(zhdr) : 0; + ret = zpool_malloc(entry->pool->zpool, hlen + dlen, __GFP_NORETRY | __GFP_NOWARN | __GFP_KSWAPD_RECLAIM, &handle); if (ret == -ENOSPC) { @@ -1075,10 +1075,9 @@ static int zswap_frontswap_store(unsigned type, pgoff_t offset, zswap_reject_alloc_fail++; goto put_dstmem; } - zhdr = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); - zhdr->swpentry = swp_entry(type, offset); - buf = (u8 *)(zhdr + 1); - memcpy(buf, dst, dlen); + buf = zpool_map_handle(entry->pool->zpool, handle, ZPOOL_MM_RW); + memcpy(buf, &zhdr, hlen); + memcpy(buf + hlen, dst, dlen); zpool_unmap_handle(entry->pool->zpool, handle); put_cpu_var(zswap_dstmem); @@ -1149,8 +1148,9 @@ static int zswap_frontswap_load(unsigned type, pgoff_t offset, /* decompress */ dlen = PAGE_SIZE; - src = (u8 *)zpool_map_handle(entry->pool->zpool, entry->handle, - ZPOOL_MM_RO) + sizeof(struct zswap_header); + src = zpool_map_handle(entry->pool->zpool, entry->handle, ZPOOL_MM_RO); + if (zpool_evictable(entry->pool->zpool)) + src += sizeof(struct zswap_header); dst = kmap_atomic(page); tfm = *get_cpu_ptr(entry->pool->tfm); ret = crypto_comp_decompress(tfm, src, entry->length, dst, &dlen); From c054a78c66c7a5aa218220d8949ebcf13a86b796 Mon Sep 17 00:00:00 2001 From: Yu Zhao Date: Wed, 31 Jan 2018 16:20:02 -0800 Subject: [PATCH 096/118] memcg: refactor mem_cgroup_resize_limit() mem_cgroup_resize_limit() and mem_cgroup_resize_memsw_limit() have identical logics. Refactor code so we don't need to keep two pieces of code that does same thing. Link: http://lkml.kernel.org/r/20180108224238.14583-1-yuzhao@google.com Signed-off-by: Yu Zhao Acked-by: Vladimir Davydov Acked-by: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 77 +++++++++++-------------------------------------- 1 file changed, 17 insertions(+), 60 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 51d398f1363c..695d9f10906e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2461,13 +2461,15 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit) + unsigned long limit, bool memsw) { unsigned long curusage; unsigned long oldusage; bool enlarge = false; int retry_count; int ret; + bool limits_invariant; + struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; /* * For keeping hierarchical_reclaim simple, how long we should retry @@ -2477,7 +2479,7 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, retry_count = MEM_CGROUP_RECLAIM_RETRIES * mem_cgroup_count_children(memcg); - oldusage = page_counter_read(&memcg->memory); + oldusage = page_counter_read(counter); do { if (signal_pending(current)) { @@ -2486,73 +2488,28 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, } mutex_lock(&memcg_limit_mutex); - if (limit > memcg->memsw.limit) { + /* + * Make sure that the new limit (memsw or memory limit) doesn't + * break our basic invariant rule memory.limit <= memsw.limit. + */ + limits_invariant = memsw ? 
limit >= memcg->memory.limit : + limit <= memcg->memsw.limit; + if (!limits_invariant) { mutex_unlock(&memcg_limit_mutex); ret = -EINVAL; break; } - if (limit > memcg->memory.limit) + if (limit > counter->limit) enlarge = true; - ret = page_counter_limit(&memcg->memory, limit); + ret = page_counter_limit(counter, limit); mutex_unlock(&memcg_limit_mutex); if (!ret) break; - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, true); + try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw); - curusage = page_counter_read(&memcg->memory); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); - - if (!ret && enlarge) - memcg_oom_recover(memcg); - - return ret; -} - -static int mem_cgroup_resize_memsw_limit(struct mem_cgroup *memcg, - unsigned long limit) -{ - unsigned long curusage; - unsigned long oldusage; - bool enlarge = false; - int retry_count; - int ret; - - /* see mem_cgroup_resize_res_limit */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(&memcg->memsw); - - do { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - - mutex_lock(&memcg_limit_mutex); - if (limit < memcg->memory.limit) { - mutex_unlock(&memcg_limit_mutex); - ret = -EINVAL; - break; - } - if (limit > memcg->memsw.limit) - enlarge = true; - ret = page_counter_limit(&memcg->memsw, limit); - mutex_unlock(&memcg_limit_mutex); - - if (!ret) - break; - - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, false); - - curusage = page_counter_read(&memcg->memsw); + curusage = page_counter_read(counter); /* Usage is reduced ? */ if (curusage >= oldusage) retry_count--; @@ -3014,10 +2971,10 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_memsw_limit(memcg, nr_pages); + ret = mem_cgroup_resize_limit(memcg, nr_pages, true); break; case _KMEM: ret = memcg_update_kmem_limit(memcg, nr_pages); From 3c2c648842843326f8c6ace425810eb47864c6b4 Mon Sep 17 00:00:00 2001 From: Shile Zhang Date: Wed, 31 Jan 2018 16:20:07 -0800 Subject: [PATCH 097/118] mm/page_alloc.c: fix typos in comments Link: http://lkml.kernel.org/r/1515485774-4768-1-git-send-email-zhangshile@gmail.com Signed-off-by: Shile Zhang Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b411f97dfb25..a6972750e7c5 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -293,7 +293,7 @@ int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT /* - * Determine how many pages need to be initialized durig early boot + * Determine how many pages need to be initialized during early boot * (non-deferred initialization). * The value of first_deferred_pfn will be set later, once non-deferred pages * are initialized, but for now set it ULONG_MAX. 
@@ -344,7 +344,7 @@ static inline bool update_defer_init(pg_data_t *pgdat, unsigned long pfn, unsigned long zone_end, unsigned long *nr_initialised) { - /* Always populate low zones for address-contrained allocations */ + /* Always populate low zones for address-constrained allocations */ if (zone_end < pgdat_end_pfn(pgdat)) return true; (*nr_initialised)++; @@ -3397,7 +3397,7 @@ __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order, if (gfp_mask & __GFP_THISNODE) goto out; - /* Exhausted what can be done so it's blamo time */ + /* Exhausted what can be done so it's blame time */ if (out_of_memory(&oc) || WARN_ON_ONCE(gfp_mask & __GFP_NOFAIL)) { *did_some_progress = 1; From 6787c1dab1724ca0d92110d83485c8c72dbf83f4 Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:20:11 -0800 Subject: [PATCH 098/118] mm/page_owner.c: clean up init_pages_in_zone() Remove two redundant assignments in init_pages_in_zone(). [osalvador@techadventures.net: v3] Link: http://lkml.kernel.org/r/20180117124513.GA876@techadventures.net [akpm@linux-foundation.org: coding style tweaks] Link: http://lkml.kernel.org/r/20180110084355.GA22822@techadventures.net Signed-off-by: Oscar Salvador Acked-by: Michal Hocko Acked-by: Vlastimil Babka Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_owner.c | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/mm/page_owner.c b/mm/page_owner.c index 06a0055f45a6..9886c6073828 100644 --- a/mm/page_owner.c +++ b/mm/page_owner.c @@ -528,21 +528,18 @@ read_page_owner(struct file *file, char __user *buf, size_t count, loff_t *ppos) static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) { - struct page *page; - struct page_ext *page_ext; - unsigned long pfn = zone->zone_start_pfn, block_end_pfn; - unsigned long end_pfn = pfn + zone->spanned_pages; + unsigned long pfn = zone->zone_start_pfn; + unsigned long end_pfn = zone_end_pfn(zone); unsigned long count = 0; - /* Scan block by block. First and last block may be incomplete */ - pfn = zone->zone_start_pfn; - /* * Walk the zone in pageblock_nr_pages steps. If a page block spans * a zone boundary, it will be double counted between zones. This does * not matter as the mixed block count will still be correct */ for (; pfn < end_pfn; ) { + unsigned long block_end_pfn; + if (!pfn_valid(pfn)) { pfn = ALIGN(pfn + 1, MAX_ORDER_NR_PAGES); continue; @@ -551,9 +548,10 @@ static void init_pages_in_zone(pg_data_t *pgdat, struct zone *zone) block_end_pfn = ALIGN(pfn + 1, pageblock_nr_pages); block_end_pfn = min(block_end_pfn, end_pfn); - page = pfn_to_page(pfn); - for (; pfn < block_end_pfn; pfn++) { + struct page *page; + struct page_ext *page_ext; + if (!pfn_valid_within(pfn)) continue; From 01a6ad9ac80c9b861f63087f81e696f47b481168 Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Wed, 31 Jan 2018 16:20:15 -0800 Subject: [PATCH 099/118] zsmalloc: use U suffix for negative literals being shifted Fix warning about shifting unsigned literals being undefined behavior. 
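For context, a small stand-alone illustration of the construct being fixed (OBJ_TAG_BITS below is only a stand-in value for the example):

#define OBJ_TAG_BITS 1

unsigned long tag_mask(void)
{
        /*
         * The old expression, -1 << OBJ_TAG_BITS, left-shifts a negative
         * signed value, which the C standard leaves undefined and which
         * compilers warn about. The unsigned form is fully defined: all
         * bits set except the lowest OBJ_TAG_BITS.
         */
        return -1UL << OBJ_TAG_BITS;
}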
Link: http://lkml.kernel.org/r/1515642078-4259-1-git-send-email-nick.desaulniers@gmail.com Signed-off-by: Nick Desaulniers Suggested-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Cc: Andy Shevchenko Cc: Matthew Wilcox Cc: Nick Desaulniers Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/zsmalloc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/zsmalloc.c b/mm/zsmalloc.c index f797d8b0d820..c3013505c305 100644 --- a/mm/zsmalloc.c +++ b/mm/zsmalloc.c @@ -1047,7 +1047,7 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) * Reset OBJ_TAG_BITS bit to last link to tell * whether it's allocated object or not. */ - link->next = -1 << OBJ_TAG_BITS; + link->next = -1UL << OBJ_TAG_BITS; } kunmap_atomic(vaddr); page = next_page; From 3a45acc0869748d7a650e36377839d849c28a52c Mon Sep 17 00:00:00 2001 From: Oscar Salvador Date: Wed, 31 Jan 2018 16:20:19 -0800 Subject: [PATCH 100/118] mm/page_ext.c: make page_ext_init a noop when CONFIG_PAGE_EXTENSION but nothing uses it static struct page_ext_operations *page_ext_ops[] always contains debug_guardpage_ops, static struct page_ext_operations *page_ext_ops[] = { &debug_guardpage_ops, #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif ... } but for it to work, CONFIG_DEBUG_PAGEALLOC must be enabled first. If someone has CONFIG_PAGE_EXTENSION, but has none of its users, eg: (CONFIG_PAGE_OWNER, CONFIG_DEBUG_PAGEALLOC, CONFIG_IDLE_PAGE_TRACKING), we can shrink page_ext_init() to a simple retq. $ size vmlinux (before patch) text data bss dec hex filename 14356698 5681582 1687748 21726028 14b834c vmlinux $ size vmlinux (after patch) text data bss dec hex filename 14356008 5681538 1687748 21725294 14b806e vmlinux On the other hand, it might does not even make sense, since if someone enables CONFIG_PAGE_EXTENSION, I would expect him to enable also at least one of its users. Link: http://lkml.kernel.org/r/20180105130235.GA21241@techadventures.net Signed-off-by: Oscar Salvador Cc: Michal Hocko Cc: Vlastimil Babka Cc: Jaewon Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_ext.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/mm/page_ext.c b/mm/page_ext.c index 2c16216c29b6..5295ef331165 100644 --- a/mm/page_ext.c +++ b/mm/page_ext.c @@ -59,7 +59,9 @@ */ static struct page_ext_operations *page_ext_ops[] = { +#ifdef CONFIG_DEBUG_PAGEALLOC &debug_guardpage_ops, +#endif #ifdef CONFIG_PAGE_OWNER &page_owner_ops, #endif From 112d2d29fc087d3078f60db220c4f31f25e59cf0 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Wed, 31 Jan 2018 16:20:23 -0800 Subject: [PATCH 101/118] mm/compaction.c: fix comment for try_to_compact_pages() "mode" argument is not used by try_to_compact_pages() and sub functions anymore, it has been replaced by "prio". Fix the comment to explain the use of "prio" argument. 
Link: http://lkml.kernel.org/r/1515801336-20611-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Acked-by: Vlastimil Babka Cc: Mel Gorman Cc: David Rientjes Cc: Joonsoo Kim Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/compaction.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/compaction.c b/mm/compaction.c index 10cd757f1006..2c8999d027ab 100644 --- a/mm/compaction.c +++ b/mm/compaction.c @@ -1738,7 +1738,7 @@ int sysctl_extfrag_threshold = 500; * @order: The order of the current allocation * @alloc_flags: The allocation flags of the current allocation * @ac: The context of current allocation - * @mode: The migration mode for async, sync light, or sync migration + * @prio: Determines how hard direct compaction should try to succeed * * This is the main entry point for direct page compaction. */ From def9b71ee651a6fee93a10734b94f93a69cdb2d4 Mon Sep 17 00:00:00 2001 From: Petr Tesarik Date: Wed, 31 Jan 2018 16:20:26 -0800 Subject: [PATCH 102/118] include/linux/mmzone.h: fix explanation of lower bits in the SPARSEMEM mem_map pointer The comment is confusing. On the one hand, it refers to 32-bit alignment (struct page alignment on 32-bit platforms), but this would only guarantee that the 2 lowest bits must be zero. On the other hand, it claims that at least 3 bits are available, and 3 bits are actually used. This is not broken, because there is a stronger alignment guarantee, just less obvious. Let's fix the comment to make it clear how many bits are available and why. Although memmap arrays are allocated in various places, the resulting pointer is encoded eventually, so I am adding a BUG_ON() here to enforce at runtime that all expected bits are indeed available. I have also added a BUILD_BUG_ON to check that PFN_SECTION_SHIFT is sufficient, because this part of the calculation can be easily checked at build time. [ptesarik@suse.com: v2] Link: http://lkml.kernel.org/r/20180125100516.589ea6af@ezekiel.suse.cz Link: http://lkml.kernel.org/r/20180119080908.3a662e6f@ezekiel.suse.cz Signed-off-by: Petr Tesarik Acked-by: Michal Hocko Cc: Vlastimil Babka Cc: Mel Gorman Cc: Johannes Weiner Cc: Kemi Wang Cc: YASUAKI ISHIMATSU Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mmzone.h | 12 ++++++++++-- mm/sparse.c | 6 +++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h index 67f2e3c38939..7522a6987595 100644 --- a/include/linux/mmzone.h +++ b/include/linux/mmzone.h @@ -1166,8 +1166,16 @@ extern unsigned long usemap_size(void); /* * We use the lower bits of the mem_map pointer to store - * a little bit of information. There should be at least - * 3 bits here due to 32-bit alignment. + * a little bit of information. The pointer is calculated + * as mem_map - section_nr_to_pfn(pnum). The result is + * aligned to the minimum alignment of the two values: + * 1. All mem_map arrays are page-aligned. + * 2. section_nr_to_pfn() always clears PFN_SECTION_SHIFT + * lowest bits. PFN_SECTION_SHIFT is arch-specific + * (equal SECTION_SIZE_BITS - PAGE_SHIFT), and the + * worst combination is powerpc with 256k pages, + * which results in PFN_SECTION_SHIFT equal 6. + * To sum it up, at least 6 bits are available. 
*/ #define SECTION_MARKED_PRESENT (1UL<<0) #define SECTION_HAS_MEM_MAP (1UL<<1) diff --git a/mm/sparse.c b/mm/sparse.c index 2609aba121e8..6b8b5e91ceef 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -264,7 +264,11 @@ unsigned long __init node_memmap_size_bytes(int nid, unsigned long start_pfn, */ static unsigned long sparse_encode_mem_map(struct page *mem_map, unsigned long pnum) { - return (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + unsigned long coded_mem_map = + (unsigned long)(mem_map - (section_nr_to_pfn(pnum))); + BUILD_BUG_ON(SECTION_MAP_LAST_BIT > (1UL<<PFN_SECTION_SHIFT)); + BUG_ON(coded_mem_map & ~SECTION_MAP_MASK); + return coded_mem_map; } From Mon Sep 17 00:00:00 2001 Date: Wed, 31 Jan 2018 16:20:30 -0800 Subject: [PATCH 103/118] mm/hmm: fix uninitialized use of 'entry' in hmm_vma_walk_pmd() MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The variable 'entry' is used before being initialized in hmm_vma_walk_pmd(). There is no bad effect (besides a performance hit): !non_swap_entry(0) evaluates to true, which triggers a fault as if the CPU were trying to access migrated memory, and the memory is migrated back from device memory to regular memory. This function (hmm_vma_walk_pmd()) is called when a device driver tries to populate its own page table. For migrated memory it should not happen, as the device driver should already have populated its page table correctly during the migration. The only case I can think of is multi-GPU, where a second GPU triggers migration back to regular memory. Again, this would just result in a performance hit; nothing bad would happen. Link: http://lkml.kernel.org/r/20180122185759.26286-1-jglisse@redhat.com Signed-off-by: Ralph Campbell Signed-off-by: Jérôme Glisse Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hmm.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/hmm.c b/mm/hmm.c index ea19742a5d60..979211c7ccc8 100644 --- a/mm/hmm.c +++ b/mm/hmm.c @@ -418,7 +418,7 @@ again: } if (!pte_present(pte)) { - swp_entry_t entry; + swp_entry_t entry = pte_to_swp_entry(pte); if (!non_swap_entry(entry)) { if (hmm_vma_walk->fault) @@ -426,8 +426,6 @@ again: continue; } - entry = pte_to_swp_entry(pte); - /* * This is a special swap entry, ignore migration, use * device and report anything else as error. From 8ad6e404efa294b848782cf14f3d298762674e58 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Christopher=20D=C3=ADaz=20Riveros?= Date: Wed, 31 Jan 2018 16:20:33 -0800 Subject: [PATCH 104/118] mm/memcontrol.c: make local symbol static MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fix the following sparse warning: mm/memcontrol.c:1097:14: warning: symbol 'memcg1_stats' was not declared. Should it be static?
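For illustration only (not part of the patch), a minimal sketch of the pattern sparse is asking for, with the file and symbol names invented for this example: a table used only within one translation unit gets internal linkage (static), and since it is never written to, const as well.

/* stats_table.c -- hypothetical example, not kernel code */
static const unsigned int local_stats[] = {
	1, 2, 3,
};

unsigned int local_stats_sum(void)
{
	unsigned int i, sum = 0;

	for (i = 0; i < sizeof(local_stats) / sizeof(local_stats[0]); i++)
		sum += local_stats[i];
	return sum;
}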
Link: http://lkml.kernel.org/r/20180118193327.14200-1-chrisadr@gentoo.org Signed-off-by: Christopher Díaz Riveros Cc: Michal Hocko Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 695d9f10906e..3d7a3d02b168 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1095,7 +1095,7 @@ static bool mem_cgroup_wait_acct_move(struct mem_cgroup *memcg) return false; } -unsigned int memcg1_stats[] = { +static const unsigned int memcg1_stats[] = { MEMCG_CACHE, MEMCG_RSS, MEMCG_RSS_HUGE, From 1ab5c05695bd514119a15f74d2e43456fe94b0e5 Mon Sep 17 00:00:00 2001 From: Andrey Ryabinin Date: Wed, 31 Jan 2018 16:20:37 -0800 Subject: [PATCH 105/118] mm/memcontrol.c: try harder to decrease [memory,memsw].limit_in_bytes mem_cgroup_resize_[memsw]_limit() tries to free only 32 (SWAP_CLUSTER_MAX) pages on each iteration. This makes it practically impossible to decrease limit of memory cgroup. Tasks could easily allocate back 32 pages, so we can't reduce memory usage, and once retry_count reaches zero we return -EBUSY. Easy to reproduce the problem by running the following commands: mkdir /sys/fs/cgroup/memory/test echo $$ >> /sys/fs/cgroup/memory/test/tasks cat big_file > /dev/null & sleep 1 && echo $((100*1024*1024)) > /sys/fs/cgroup/memory/test/memory.limit_in_bytes -bash: echo: write error: Device or resource busy Instead of relying on retry_count, keep retrying the reclaim until the desired limit is reached or fail if the reclaim doesn't make any progress or a signal is pending. Link: http://lkml.kernel.org/r/20180119132544.19569-1-aryabinin@virtuozzo.com Signed-off-by: Andrey Ryabinin Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Shakeel Butt Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 42 ++++++------------------------------------ 1 file changed, 6 insertions(+), 36 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3d7a3d02b168..0ae2dc3a1748 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1176,20 +1176,6 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) } } -/* - * This function returns the number of memcg under hierarchy tree. Returns - * 1(self count) if no children. - */ -static int mem_cgroup_count_children(struct mem_cgroup *memcg) -{ - int num = 0; - struct mem_cgroup *iter; - - for_each_mem_cgroup_tree(iter, memcg) - num++; - return num; -} - /* * Return the memory (and swap, if configured) limit for a memcg. */ @@ -2463,24 +2449,11 @@ static DEFINE_MUTEX(memcg_limit_mutex); static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, unsigned long limit, bool memsw) { - unsigned long curusage; - unsigned long oldusage; bool enlarge = false; - int retry_count; int ret; bool limits_invariant; struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; - /* - * For keeping hierarchical_reclaim simple, how long we should retry - * is depends on callers. We set our retry-count to be function - * of # of children which we should visit in this loop. 
- */ - retry_count = MEM_CGROUP_RECLAIM_RETRIES * - mem_cgroup_count_children(memcg); - - oldusage = page_counter_read(counter); - do { if (signal_pending(current)) { ret = -EINTR; @@ -2507,15 +2480,12 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, if (!ret) break; - try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw); - - curusage = page_counter_read(counter); - /* Usage is reduced ? */ - if (curusage >= oldusage) - retry_count--; - else - oldusage = curusage; - } while (retry_count); + if (!try_to_free_mem_cgroup_pages(memcg, 1, + GFP_KERNEL, !memsw)) { + ret = -EBUSY; + break; + } + } while (true); if (!ret && enlarge) memcg_oom_recover(memcg); From af0fb9df784174f8cb02c57b33728a6a4f1de9fb Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:41 -0800 Subject: [PATCH 106/118] mm, hugetlb: unify core page allocation accounting and initialization Patch series "mm, hugetlb: allocation API and migration improvements" Motivation: this is a follow up for [3] for the allocation API and [4] for the hugetlb migration. It wasn't really easy to split those into two separate patch series as they share some code. My primary motivation to touch this code is to make the gigantic pages migration working. The giga pages allocation code is just too fragile and hacked into the hugetlb code now. This series tries to move giga pages closer to the first class citizen. We are not there yet but having 5 patches is quite a lot already and it will already make the code much easier to follow. I will come with other changes on top after this sees some review. The first two patches should be trivial to review. The third patch changes the way how we migrate huge pages. Newly allocated pages are a subject of the overcommit check and they participate surplus accounting which is quite unfortunate as the changelog explains. This patch doesn't change anything wrt. giga pages. Patch #4 removes the surplus accounting hack from __alloc_surplus_huge_page. I hope I didn't miss anything there and a deeper review is really due there. Patch #5 finally unifies allocation paths and giga pages shouldn't be any special anymore. There is also some renaming going on as well. This patch (of 6): hugetlb allocator has two entry points to the page allocator - alloc_fresh_huge_page_node - __hugetlb_alloc_buddy_huge_page The two differ very subtly in two aspects. The first one doesn't care about HTLB_BUDDY_* stats and it doesn't initialize the huge page. prep_new_huge_page is not used because it not only initializes hugetlb specific stuff but because it also put_page and releases the page to the hugetlb pool which is not what is required in some contexts. This makes things more complicated than necessary. Simplify things by a) removing the page allocator entry point duplicity and only keep __hugetlb_alloc_buddy_huge_page and b) make prep_new_huge_page more reusable by removing the put_page which moves the page to the allocator pool. All current callers are updated to call put_page explicitly. Later patches will add new callers which won't need it. This patch shouldn't introduce any functional change. Link: http://lkml.kernel.org/r/20180103093213.26329-2-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. 
Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 61 +++++++++++++++++++++++++--------------------------- 1 file changed, 29 insertions(+), 32 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 4137fb67cd79..a8959667f539 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1157,6 +1157,7 @@ static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) if (page) { prep_compound_gigantic_page(page, huge_page_order(h)); prep_new_huge_page(h, page, nid); + put_page(page); /* free it into the hugepage allocator */ } return page; @@ -1304,7 +1305,6 @@ static void prep_new_huge_page(struct hstate *h, struct page *page, int nid) h->nr_huge_pages++; h->nr_huge_pages_node[nid]++; spin_unlock(&hugetlb_lock); - put_page(page); /* free it into the hugepage allocator */ } static void prep_compound_gigantic_page(struct page *page, unsigned int order) @@ -1381,41 +1381,49 @@ pgoff_t __basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *alloc_fresh_huge_page_node(struct hstate *h, int nid) +static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) { + int order = huge_page_order(h); struct page *page; - page = __alloc_pages_node(nid, - htlb_alloc_mask(h)|__GFP_COMP|__GFP_THISNODE| - __GFP_RETRY_MAYFAIL|__GFP_NOWARN, - huge_page_order(h)); - if (page) { - prep_new_huge_page(h, page, nid); - } + gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; + if (nid == NUMA_NO_NODE) + nid = numa_mem_id(); + page = __alloc_pages_nodemask(gfp_mask, order, nid, nmask); + if (page) + __count_vm_event(HTLB_BUDDY_PGALLOC); + else + __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); return page; } +/* + * Allocates a fresh page to the hugetlb allocator pool in the node interleaved + * manner. 
+ */ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int nr_nodes, node; - int ret = 0; + gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_huge_page_node(h, node); - if (page) { - ret = 1; + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, + node, nodes_allowed); + if (page) break; - } + } - if (ret) - count_vm_event(HTLB_BUDDY_PGALLOC); - else - count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); + if (!page) + return 0; - return ret; + prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* free it into the hugepage allocator */ + + return 1; } /* @@ -1523,17 +1531,6 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, - gfp_t gfp_mask, int nid, nodemask_t *nmask) -{ - int order = huge_page_order(h); - - gfp_mask |= __GFP_COMP|__GFP_RETRY_MAYFAIL|__GFP_NOWARN; - if (nid == NUMA_NO_NODE) - nid = numa_mem_id(); - return __alloc_pages_nodemask(gfp_mask, order, nid, nmask); -} - static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { @@ -1589,11 +1586,9 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, */ h->nr_huge_pages_node[r_nid]++; h->surplus_huge_pages_node[r_nid]++; - __count_vm_event(HTLB_BUDDY_PGALLOC); } else { h->nr_huge_pages--; h->surplus_huge_pages--; - __count_vm_event(HTLB_BUDDY_PGALLOC_FAIL); } spin_unlock(&hugetlb_lock); @@ -2148,6 +2143,8 @@ static void __init gather_bootmem_prealloc(void) prep_compound_huge_page(page, h->order); WARN_ON(PageReserved(page)); prep_new_huge_page(h, page, page_to_nid(page)); + put_page(page); /* free it into the hugepage allocator */ + /* * If we had gigantic hugepages allocated at boot time, we need * to restore the 'stolen' pages to totalram_pages in order to From d9cc948f6fa1c3384037f500e0acd35f03850d15 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:44 -0800 Subject: [PATCH 107/118] mm, hugetlb: integrate giga hugetlb more naturally to the allocation path Gigantic hugetlb pages were grafted onto the hugetlb code as an alien species with a lot of special casing. The allocation path is not an exception. Unnecessarily so, to be honest. It is true that the underlying allocator is different, but that is an implementation detail. This patch unifies the hugetlb allocation path that prepares fresh pool pages. alloc_fresh_gigantic_page basically copies the alloc_fresh_huge_page logic, so we can move everything there. This will simplify set_max_huge_pages, which doesn't have to care about what kind of huge page we allocate. Link: http://lkml.kernel.org/r/20180103093213.26329-3-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A.
Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 55 +++++++++++++--------------------------------------- 1 file changed, 14 insertions(+), 41 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index a8959667f539..360765156c7c 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1106,7 +1106,8 @@ static bool zone_spans_last_pfn(const struct zone *zone, return zone_spans_pfn(zone, last_pfn); } -static struct page *alloc_gigantic_page(int nid, struct hstate *h) +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { unsigned int order = huge_page_order(h); unsigned long nr_pages = 1 << order; @@ -1114,11 +1115,9 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) struct zonelist *zonelist; struct zone *zone; struct zoneref *z; - gfp_t gfp_mask; - gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; zonelist = node_zonelist(nid, gfp_mask); - for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), NULL) { + for_each_zone_zonelist_nodemask(zone, z, zonelist, gfp_zone(gfp_mask), nodemask) { spin_lock_irqsave(&zone->lock, flags); pfn = ALIGN(zone->zone_start_pfn, nr_pages); @@ -1149,42 +1148,13 @@ static struct page *alloc_gigantic_page(int nid, struct hstate *h) static void prep_new_huge_page(struct hstate *h, struct page *page, int nid); static void prep_compound_gigantic_page(struct page *page, unsigned int order); -static struct page *alloc_fresh_gigantic_page_node(struct hstate *h, int nid) -{ - struct page *page; - - page = alloc_gigantic_page(nid, h); - if (page) { - prep_compound_gigantic_page(page, huge_page_order(h)); - prep_new_huge_page(h, page, nid); - put_page(page); /* free it into the hugepage allocator */ - } - - return page; -} - -static int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) -{ - struct page *page = NULL; - int nr_nodes, node; - - for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = alloc_fresh_gigantic_page_node(h, node); - if (page) - return 1; - } - - return 0; -} - #else /* !CONFIG_ARCH_HAS_GIGANTIC_PAGE */ static inline bool gigantic_page_supported(void) { return false; } +static struct page *alloc_gigantic_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nodemask) { return NULL; } static inline void free_gigantic_page(struct page *page, unsigned int order) { } static inline void destroy_compound_gigantic_page(struct page *page, unsigned int order) { } -static inline int alloc_fresh_gigantic_page(struct hstate *h, - nodemask_t *nodes_allowed) { return 0; } #endif static void update_and_free_page(struct hstate *h, struct page *page) @@ -1410,8 +1380,12 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, - node, nodes_allowed); + if (hstate_is_gigantic(h)) + page = alloc_gigantic_page(h, gfp_mask, + node, nodes_allowed); + else + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, + node, nodes_allowed); if (page) break; @@ -1420,6 +1394,8 @@ static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) if (!page) return 0; + if (hstate_is_gigantic(h)) + prep_compound_gigantic_page(page, huge_page_order(h)); prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* free it into the hugepage allocator */ @@ -2307,10 +2283,7 @@ static 
unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, /* yield cpu to avoid soft lockup */ cond_resched(); - if (hstate_is_gigantic(h)) - ret = alloc_fresh_gigantic_page(h, nodes_allowed); - else - ret = alloc_fresh_huge_page(h, nodes_allowed); + ret = alloc_fresh_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; From ab5ac90aecf5685eb630c42c396f5f14726b0afd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:48 -0800 Subject: [PATCH 108/118] mm, hugetlb: do not rely on overcommit limit during migration hugepage migration relies on __alloc_buddy_huge_page to get a new page. This has 2 main disadvantages. 1) it doesn't allow to migrate any huge page if the pool is used completely which is not an exceptional case as the pool is static and unused memory is just wasted. 2) it leads to a weird semantic when migration between two numa nodes might increase the pool size of the destination NUMA node while the page is in use. The issue is caused by per NUMA node surplus pages tracking (see free_huge_page). Address both issues by changing the way how we allocate and account pages allocated for migration. Those should temporal by definition. So we mark them that way (we will abuse page flags in the 3rd page) and update free_huge_page to free such pages to the page allocator. Page migration path then just transfers the temporal status from the new page to the old one which will be freed on the last reference. The global surplus count will never change during this path but we still have to be careful when migrating a per-node suprlus page. This is now handled in move_hugetlb_state which is called from the migration path and it copies the hugetlb specific page state and fixes up the accounting when needed Rename __alloc_buddy_huge_page to __alloc_surplus_huge_page to better reflect its purpose. The new allocation routine for the migration path is __alloc_migrate_huge_page. The user visible effect of this patch is that migrated pages are really temporal and they travel between NUMA nodes as per the migration request: Before migration /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 After /sys/devices/system/node/node0/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/nr_hugepages:0 /sys/devices/system/node/node0/hugepages/hugepages-2048kB/surplus_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/free_hugepages:0 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/nr_hugepages:1 /sys/devices/system/node/node1/hugepages/hugepages-2048kB/surplus_hugepages:0 with the previous implementation, both nodes would have nr_hugepages:1 until the page is freed. Link: http://lkml.kernel.org/r/20180103093213.26329-4-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. 
Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 3 ++ mm/hugetlb.c | 111 ++++++++++++++++++++++++++++++++++------ mm/migrate.c | 3 +- 3 files changed, 99 insertions(+), 18 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 944e6e8bd572..66992348531e 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -119,6 +119,7 @@ long hugetlb_unreserve_pages(struct inode *inode, long start, long end, long freed); bool isolate_huge_page(struct page *page, struct list_head *list); void putback_active_hugepage(struct page *page); +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason); void free_huge_page(struct page *page); void hugetlb_fix_reserve_counts(struct inode *inode); extern struct mutex *hugetlb_fault_mutex_table; @@ -157,6 +158,7 @@ unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot); bool is_hugetlb_entry_migration(pte_t pte); + #else /* !CONFIG_HUGETLB_PAGE */ static inline void reset_vma_resv_huge_pages(struct vm_area_struct *vma) @@ -197,6 +199,7 @@ static inline bool isolate_huge_page(struct page *page, struct list_head *list) return false; } #define putback_active_hugepage(p) do {} while (0) +#define move_hugetlb_state(old, new, reason) do {} while (0) static inline unsigned long hugetlb_change_protection(struct vm_area_struct *vma, unsigned long address, unsigned long end, pgprot_t newprot) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 360765156c7c..f260ffa26363 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -34,6 +34,7 @@ #include #include #include +#include #include "internal.h" int hugetlb_max_hstate __read_mostly; @@ -1219,6 +1220,28 @@ static void clear_page_huge_active(struct page *page) ClearPagePrivate(&page[1]); } +/* + * Internal hugetlb specific page flag. Do not use outside of the hugetlb + * code + */ +static inline bool PageHugeTemporary(struct page *page) +{ + if (!PageHuge(page)) + return false; + + return (unsigned long)page[2].mapping == -1U; +} + +static inline void SetPageHugeTemporary(struct page *page) +{ + page[2].mapping = (void *)-1U; +} + +static inline void ClearPageHugeTemporary(struct page *page) +{ + page[2].mapping = NULL; +} + void free_huge_page(struct page *page) { /* @@ -1253,7 +1276,11 @@ void free_huge_page(struct page *page) if (restore_reserve) h->resv_huge_pages++; - if (h->surplus_huge_pages_node[nid]) { + if (PageHugeTemporary(page)) { + list_del(&page->lru); + ClearPageHugeTemporary(page); + update_and_free_page(h, page); + } else if (h->surplus_huge_pages_node[nid]) { /* remove the page from active list */ list_del(&page->lru); update_and_free_page(h, page); @@ -1507,7 +1534,10 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) return rc; } -static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, +/* + * Allocates a fresh surplus page from the page allocator. 
+ */ +static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct page *page; @@ -1571,6 +1601,28 @@ static struct page *__alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, return page; } +static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, + int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + return NULL; + + page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + if (!page) + return NULL; + + /* + * We do not account these pages as surplus because they are only + * temporary and will be released properly on the last reference + */ + prep_new_huge_page(h, page, page_to_nid(page)); + SetPageHugeTemporary(page); + + return page; +} + /* * Use the VMA's mpolicy to allocate a huge page from the buddy. */ @@ -1585,17 +1637,13 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = __alloc_buddy_huge_page(h, gfp_mask, nid, nodemask); + page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; } -/* - * This allocation function is useful in the context where vma is irrelevant. - * E.g. soft-offlining uses this function because it only cares physical - * address of error page. - */ +/* page migration callback function */ struct page *alloc_huge_page_node(struct hstate *h, int nid) { gfp_t gfp_mask = htlb_alloc_mask(h); @@ -1610,12 +1658,12 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_buddy_huge_page(h, gfp_mask, nid, NULL); + page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL); return page; } - +/* page migration callback function */ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask) { @@ -1633,9 +1681,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } spin_unlock(&hugetlb_lock); - /* No reservations, try to overcommit */ - - return __alloc_buddy_huge_page(h, gfp_mask, preferred_nid, nmask); + return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } /* @@ -1663,7 +1709,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_buddy_huge_page(h, htlb_alloc_mask(h), + page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; @@ -2260,7 +2306,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with __alloc_buddy_huge_page() here and be unable + * We might race with __alloc_surplus_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -2303,7 +2349,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. 
Since - * __alloc_buddy_huge_page() is checking the global counter, + * __alloc_surplus_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. @@ -4779,3 +4825,36 @@ void putback_active_hugepage(struct page *page) spin_unlock(&hugetlb_lock); put_page(page); } + +void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) +{ + struct hstate *h = page_hstate(oldpage); + + hugetlb_cgroup_migrate(oldpage, newpage); + set_page_owner_migrate_reason(newpage, reason); + + /* + * transfer temporary state of the new huge page. This is + * reverse to other transitions because the newpage is going to + * be final while the old one will be freed so it takes over + * the temporary status. + * + * Also note that we have to transfer the per-node surplus state + * here as well otherwise the global surplus count will not match + * the per-node's. + */ + if (PageHugeTemporary(newpage)) { + int old_nid = page_to_nid(oldpage); + int new_nid = page_to_nid(newpage); + + SetPageHugeTemporary(oldpage); + ClearPageHugeTemporary(newpage); + + spin_lock(&hugetlb_lock); + if (h->surplus_huge_pages_node[old_nid]) { + h->surplus_huge_pages_node[old_nid]--; + h->surplus_huge_pages_node[new_nid]++; + } + spin_unlock(&hugetlb_lock); + } +} diff --git a/mm/migrate.c b/mm/migrate.c index 4d0be47a322a..1e5525a25691 100644 --- a/mm/migrate.c +++ b/mm/migrate.c @@ -1323,9 +1323,8 @@ put_anon: put_anon_vma(anon_vma); if (rc == MIGRATEPAGE_SUCCESS) { - hugetlb_cgroup_migrate(hpage, new_hpage); + move_hugetlb_state(hpage, new_hpage, reason); put_new_page = NULL; - set_page_owner_migrate_reason(new_hpage, reason); } unlock_page(hpage); From 9980d744a04281c65a8849c437c8ab9fec2db17b Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:52 -0800 Subject: [PATCH 109/118] mm, hugetlb: get rid of surplus page accounting tricks alloc_surplus_huge_page increases the pool size and the number of surplus pages opportunistically to prevent from races with the pool size change. See commit d1c3fb1f8f29 ("hugetlb: introduce nr_overcommit_hugepages sysctl") for more details. The resulting code is unnecessarily hairy, cause code duplication and doesn't allow to share the allocation paths. Moreover pool size changes tend to be very seldom so optimizing for them is not really reasonable. Simplify the code and allow to allocate a fresh surplus page as long as we are under the overcommit limit and then recheck the condition after the allocation and drop the new page if the situation has changed. This should provide a reasonable guarantee that an abrupt allocation requests will not go way off the limit. If we consider races with the pool shrinking and enlarging then we should be reasonably safe as well. In the first case we are off by one in the worst case and the second case should work OK because the page is not yet visible. We can waste CPU cycles for the allocation but that should be acceptable for a relatively rare condition. Link: http://lkml.kernel.org/r/20180103093213.26329-5-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. 
Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 62 +++++++++++++++++++--------------------------------- 1 file changed, 23 insertions(+), 39 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index f260ffa26363..7dc80cbe8e89 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1540,62 +1540,46 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { - struct page *page; - unsigned int r_nid; + struct page *page = NULL; if (hstate_is_gigantic(h)) return NULL; - /* - * Assume we will successfully allocate the surplus page to - * prevent racing processes from causing the surplus to exceed - * overcommit - * - * This however introduces a different race, where a process B - * tries to grow the static hugepage pool while alloc_pages() is - * called by process A. B will only examine the per-node - * counters in determining if surplus huge pages can be - * converted to normal huge pages in adjust_pool_surplus(). A - * won't be able to increment the per-node counter, until the - * lock is dropped by B, but B doesn't drop hugetlb_lock until - * no more huge pages can be converted from surplus to normal - * state (and doesn't try to convert again). Thus, we have a - * case where a surplus huge page exists, the pool is grown, and - * the surplus huge page still exists after, even though it - * should just have been converted to a normal huge page. This - * does not leak memory, though, as the hugepage will be freed - * once it is out of use. It also does not allow the counters to - * go out of whack in adjust_pool_surplus() as we don't modify - * the node values until we've gotten the hugepage and only the - * per-node value is checked there. - */ spin_lock(&hugetlb_lock); - if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { - spin_unlock(&hugetlb_lock); - return NULL; - } else { - h->nr_huge_pages++; - h->surplus_huge_pages++; - } + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) + goto out_unlock; spin_unlock(&hugetlb_lock); page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + if (!page) + goto out_unlock; spin_lock(&hugetlb_lock); - if (page) { + /* + * We could have raced with the pool size change. + * Double check that and simply deallocate the new page + * if we would end up overcommiting the surpluses. Abuse + * temporary page to workaround the nasty free_huge_page + * codeflow + */ + if (h->surplus_huge_pages >= h->nr_overcommit_huge_pages) { + SetPageHugeTemporary(page); + put_page(page); + page = NULL; + } else { + int r_nid; + + h->surplus_huge_pages++; + h->nr_huge_pages++; INIT_LIST_HEAD(&page->lru); r_nid = page_to_nid(page); set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); set_hugetlb_cgroup(page, NULL); - /* - * We incremented the global counters already - */ h->nr_huge_pages_node[r_nid]++; h->surplus_huge_pages_node[r_nid]++; - } else { - h->nr_huge_pages--; - h->surplus_huge_pages--; } + +out_unlock: spin_unlock(&hugetlb_lock); return page; From 0c397daea1d456f304e00413ee9e90a1830868a5 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:20:56 -0800 Subject: [PATCH 110/118] mm, hugetlb: further simplify hugetlb allocation API Hugetlb allocator has several layer of allocation functions depending and the purpose of the allocation. 
There are two allocators depending on whether the page can be allocated from the page allocator or we need a contiguous allocator. This is currently opencoded in alloc_fresh_huge_page which is the only path that might allocate giga pages which require the later allocator. Create alloc_fresh_huge_page which hides this implementation detail and use it in all callers which hardcoded the buddy allocator path (__hugetlb_alloc_buddy_huge_page). This shouldn't introduce any funtional change because both migration and surplus allocators exlude giga pages explicitly. While we are at it let's do some renaming. The current scheme is not consistent and overly painfull to read and understand. Get rid of prefix underscores from most functions. There is no real reason to make names longer. * alloc_fresh_huge_page is the new layer to abstract underlying allocator * __hugetlb_alloc_buddy_huge_page becomes shorter and neater alloc_buddy_huge_page. * Former alloc_fresh_huge_page becomes alloc_pool_huge_page because we put the new page directly to the pool * alloc_surplus_huge_page can drop the opencoded prep_new_huge_page code as it uses alloc_fresh_huge_page now * others lose their excessive prefix underscores to make names shorter [dan.carpenter@oracle.com: fix double unlock bug in alloc_surplus_huge_page()] Link: http://lkml.kernel.org/r/20180109200559.g3iz5kvbdrz7yydp@mwanda Link: http://lkml.kernel.org/r/20180103093213.26329-6-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Dan Carpenter Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 80 ++++++++++++++++++++++++++++------------------------ 1 file changed, 43 insertions(+), 37 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 7dc80cbe8e89..b55886af82aa 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1378,7 +1378,7 @@ pgoff_t __basepage_index(struct page *page) return (index << compound_order(page_head)) + compound_idx; } -static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, +static struct page *alloc_buddy_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { int order = huge_page_order(h); @@ -1396,34 +1396,49 @@ static struct page *__hugetlb_alloc_buddy_huge_page(struct hstate *h, return page; } +/* + * Common helper to allocate a fresh hugetlb page. All specific allocators + * should use this function to get new hugetlb pages + */ +static struct page *alloc_fresh_huge_page(struct hstate *h, + gfp_t gfp_mask, int nid, nodemask_t *nmask) +{ + struct page *page; + + if (hstate_is_gigantic(h)) + page = alloc_gigantic_page(h, gfp_mask, nid, nmask); + else + page = alloc_buddy_huge_page(h, gfp_mask, + nid, nmask); + if (!page) + return NULL; + + if (hstate_is_gigantic(h)) + prep_compound_gigantic_page(page, huge_page_order(h)); + prep_new_huge_page(h, page, page_to_nid(page)); + + return page; +} + /* * Allocates a fresh page to the hugetlb allocator pool in the node interleaved * manner. 
*/ -static int alloc_fresh_huge_page(struct hstate *h, nodemask_t *nodes_allowed) +static int alloc_pool_huge_page(struct hstate *h, nodemask_t *nodes_allowed) { struct page *page; int nr_nodes, node; gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE; for_each_node_mask_to_alloc(h, nr_nodes, node, nodes_allowed) { - if (hstate_is_gigantic(h)) - page = alloc_gigantic_page(h, gfp_mask, - node, nodes_allowed); - else - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, - node, nodes_allowed); + page = alloc_fresh_huge_page(h, gfp_mask, node, nodes_allowed); if (page) break; - } if (!page) return 0; - if (hstate_is_gigantic(h)) - prep_compound_gigantic_page(page, huge_page_order(h)); - prep_new_huge_page(h, page, page_to_nid(page)); put_page(page); /* free it into the hugepage allocator */ return 1; @@ -1537,7 +1552,7 @@ int dissolve_free_huge_pages(unsigned long start_pfn, unsigned long end_pfn) /* * Allocates a fresh surplus page from the page allocator. */ -static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct page *alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct page *page = NULL; @@ -1550,9 +1565,9 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, goto out_unlock; spin_unlock(&hugetlb_lock); - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); if (!page) - goto out_unlock; + return NULL; spin_lock(&hugetlb_lock); /* @@ -1567,16 +1582,8 @@ static struct page *__alloc_surplus_huge_page(struct hstate *h, gfp_t gfp_mask, put_page(page); page = NULL; } else { - int r_nid; - h->surplus_huge_pages++; - h->nr_huge_pages++; - INIT_LIST_HEAD(&page->lru); - r_nid = page_to_nid(page); - set_compound_page_dtor(page, HUGETLB_PAGE_DTOR); - set_hugetlb_cgroup(page, NULL); - h->nr_huge_pages_node[r_nid]++; - h->surplus_huge_pages_node[r_nid]++; + h->nr_huge_pages_node[page_to_nid(page)]++; } out_unlock: @@ -1585,7 +1592,7 @@ out_unlock: return page; } -static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, +static struct page *alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, int nid, nodemask_t *nmask) { struct page *page; @@ -1593,7 +1600,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, if (hstate_is_gigantic(h)) return NULL; - page = __hugetlb_alloc_buddy_huge_page(h, gfp_mask, nid, nmask); + page = alloc_fresh_huge_page(h, gfp_mask, nid, nmask); if (!page) return NULL; @@ -1601,7 +1608,6 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * We do not account these pages as surplus because they are only * temporary and will be released properly on the last reference */ - prep_new_huge_page(h, page, page_to_nid(page)); SetPageHugeTemporary(page); return page; @@ -1611,7 +1617,7 @@ static struct page *__alloc_migrate_huge_page(struct hstate *h, gfp_t gfp_mask, * Use the VMA's mpolicy to allocate a huge page from the buddy. 
*/ static -struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, +struct page *alloc_buddy_huge_page_with_mpol(struct hstate *h, struct vm_area_struct *vma, unsigned long addr) { struct page *page; @@ -1621,7 +1627,7 @@ struct page *__alloc_buddy_huge_page_with_mpol(struct hstate *h, nodemask_t *nodemask; nid = huge_node(vma, addr, gfp_mask, &mpol, &nodemask); - page = __alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); + page = alloc_surplus_huge_page(h, gfp_mask, nid, nodemask); mpol_cond_put(mpol); return page; @@ -1642,7 +1648,7 @@ struct page *alloc_huge_page_node(struct hstate *h, int nid) spin_unlock(&hugetlb_lock); if (!page) - page = __alloc_migrate_huge_page(h, gfp_mask, nid, NULL); + page = alloc_migrate_huge_page(h, gfp_mask, nid, NULL); return page; } @@ -1665,7 +1671,7 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } spin_unlock(&hugetlb_lock); - return __alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); + return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } /* @@ -1693,7 +1699,7 @@ static int gather_surplus_pages(struct hstate *h, int delta) retry: spin_unlock(&hugetlb_lock); for (i = 0; i < needed; i++) { - page = __alloc_surplus_huge_page(h, htlb_alloc_mask(h), + page = alloc_surplus_huge_page(h, htlb_alloc_mask(h), NUMA_NO_NODE, NULL); if (!page) { alloc_ok = false; @@ -2030,7 +2036,7 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, page = dequeue_huge_page_vma(h, vma, addr, avoid_reserve, gbl_chg); if (!page) { spin_unlock(&hugetlb_lock); - page = __alloc_buddy_huge_page_with_mpol(h, vma, addr); + page = alloc_buddy_huge_page_with_mpol(h, vma, addr); if (!page) goto out_uncharge_cgroup; if (!avoid_reserve && vma_has_reserves(vma, gbl_chg)) { @@ -2170,7 +2176,7 @@ static void __init hugetlb_hstate_alloc_pages(struct hstate *h) if (hstate_is_gigantic(h)) { if (!alloc_bootmem_huge_page(h)) break; - } else if (!alloc_fresh_huge_page(h, + } else if (!alloc_pool_huge_page(h, &node_states[N_MEMORY])) break; cond_resched(); @@ -2290,7 +2296,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * First take pages out of surplus state. Then make up the * remaining difference by allocating fresh huge pages. * - * We might race with __alloc_surplus_huge_page() here and be unable + * We might race with alloc_surplus_huge_page() here and be unable * to convert a surplus huge page to a normal huge page. That is * not critical, though, it just means the overall size of the * pool might be one hugepage larger than it needs to be, but @@ -2313,7 +2319,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, /* yield cpu to avoid soft lockup */ cond_resched(); - ret = alloc_fresh_huge_page(h, nodes_allowed); + ret = alloc_pool_huge_page(h, nodes_allowed); spin_lock(&hugetlb_lock); if (!ret) goto out; @@ -2333,7 +2339,7 @@ static unsigned long set_max_huge_pages(struct hstate *h, unsigned long count, * By placing pages into the surplus state independent of the * overcommit value, we are allowing the surplus pool size to * exceed overcommit. There are few sane options here. Since - * __alloc_surplus_huge_page() is checking the global counter, + * alloc_surplus_huge_page() is checking the global counter, * though, we'll note that we're not allowed to exceed surplus * and won't grow the pool anywhere else. Not until one of the * sysctls are changed, or the surplus pages go out of use. 
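For readers following the locking in the two hugetlb patches above, a hedged user-space sketch (not kernel code; pool_lock, nr_surplus and nr_overcommit are invented names) of the scheme alloc_surplus_huge_page() now uses: check the limit under the lock, drop the lock for the slow allocation, then retake the lock and recheck before accounting, backing the allocation out if the limit changed in the meantime. The error path after the unlocked allocation also mirrors the folded-in Dan Carpenter fix: the lock is not held there, so it is a plain return rather than a goto that unlocks.

#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>

static pthread_mutex_t pool_lock = PTHREAD_MUTEX_INITIALIZER;
static unsigned long nr_surplus;
static unsigned long nr_overcommit = 8;

static void *alloc_surplus_object(size_t size)
{
	void *obj;

	pthread_mutex_lock(&pool_lock);
	if (nr_surplus >= nr_overcommit) {
		pthread_mutex_unlock(&pool_lock);
		return NULL;			/* over the limit, give up early */
	}
	pthread_mutex_unlock(&pool_lock);

	obj = malloc(size);			/* slow path runs without the lock */
	if (!obj)
		return NULL;			/* lock not held here: plain return, no unlock */

	pthread_mutex_lock(&pool_lock);
	if (nr_surplus >= nr_overcommit) {
		free(obj);			/* raced with a limit change: back out */
		obj = NULL;
	} else {
		nr_surplus++;			/* account only once we are sure */
	}
	pthread_mutex_unlock(&pool_lock);

	return obj;
}

int main(void)
{
	void *p = alloc_surplus_object(64);

	printf("%s\n", p ? "allocated" : "refused");
	free(p);
	return 0;
}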
From ebd637235890a3fa6a6d4bb57522098f2f59c693 Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:00 -0800 Subject: [PATCH 111/118] hugetlb, mempolicy: fix the mbind hugetlb migration do_mbind migration code relies on alloc_huge_page_noerr for hugetlb pages. alloc_huge_page_noerr uses alloc_huge_page which is a highlevel allocation function which has to take care of reserves, overcommit or hugetlb cgroup accounting. None of that is really required for the page migration because the new page is only temporal and either will replace the original page or it will be dropped. This is essentially as for other migration call paths and there shouldn't be any reason to handle mbind in a special way. The current implementation is even suboptimal because the migration might fail just because the hugetlb cgroup limit is reached, or the overcommit is saturated. Fix this by making mbind like other hugetlb migration paths. Add a new migration helper alloc_huge_page_vma as a wrapper around alloc_huge_page_nodemask with additional mempolicy handling. alloc_huge_page_noerr has no more users and it can go. Link: http://lkml.kernel.org/r/20180103093213.26329-7-mhocko@kernel.org Signed-off-by: Michal Hocko Reviewed-by: Mike Kravetz Reviewed-by: Naoya Horiguchi Cc: Andrea Reale Cc: Anshuman Khandual Cc: Kirill A. Shutemov Cc: Vlastimil Babka Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 ++--- mm/hugetlb.c | 33 +++++++++++++++++++-------------- mm/mempolicy.c | 3 +-- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 66992348531e..612a29b7f6c6 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -356,10 +356,9 @@ struct huge_bootmem_page { struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_node(struct hstate *h, int nid); -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -537,7 +536,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_noerr(v, a, r) NULL +#define alloc_huge_page_vma(vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index b55886af82aa..742a929f2311 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1674,6 +1674,25 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, return alloc_migrate_huge_page(h, gfp_mask, preferred_nid, nmask); } +/* mempolicy aware migration callback */ +struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address) +{ + struct mempolicy *mpol; + nodemask_t *nodemask; + struct page *page; + struct hstate *h; + gfp_t gfp_mask; + int node; + + h = hstate_vma(vma); + gfp_mask = htlb_alloc_mask(h); + node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); + page = alloc_huge_page_nodemask(h, node, nodemask); + mpol_cond_put(mpol); + + return page; +} + /* * Increase the hugetlb pool such that it can accommodate a reservation * of 
size 'delta'. @@ -2079,20 +2098,6 @@ out_subpool_put: return ERR_PTR(-ENOSPC); } -/* - * alloc_huge_page()'s wrapper which simply returns the page if allocation - * succeeds, otherwise NULL. This function is called from new_vma_page(), - * where no ERR_VALUE is expected to be returned. - */ -struct page *alloc_huge_page_noerr(struct vm_area_struct *vma, - unsigned long addr, int avoid_reserve) -{ - struct page *page = alloc_huge_page(vma, addr, avoid_reserve); - if (IS_ERR(page)) - page = NULL; - return page; -} - int alloc_bootmem_huge_page(struct hstate *h) __attribute__ ((weak, alias("__alloc_bootmem_huge_page"))); int __alloc_bootmem_huge_page(struct hstate *h) diff --git a/mm/mempolicy.c b/mm/mempolicy.c index f604b22ebb65..96823fa07f38 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1121,8 +1121,7 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) } if (PageHuge(page)) { - BUG_ON(!vma); - return alloc_huge_page_noerr(vma, address, 1); + return alloc_huge_page_vma(vma, address); } else if (thp_migration_supported() && PageTransHuge(page)) { struct page *thp; From 389c8178d0904f944887ccca2256ff9d79c12e8e Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:03 -0800 Subject: [PATCH 112/118] hugetlb, mbind: fall back to default policy if vma is NULL Dan Carpenter has noticed that the mbind migration callback (new_page) can get a NULL vma pointer and choke on it inside alloc_huge_page_vma, which relies on the VMA to get the hstate. We used to BUG_ON this case, but the BUG_ON has been removed recently by "hugetlb, mempolicy: fix the mbind hugetlb migration". The proper way to handle this is to get the hstate from the migrated page and rely on huge_node (resp. get_vma_policy) to do the right thing with a null VMA. We are currently falling back to the default mempolicy in that case, which is in line with what the THP path does here.
Link: http://lkml.kernel.org/r/20180110104712.GR1732@dhcp22.suse.cz Signed-off-by: Michal Hocko Reported-by: Dan Carpenter Cc: Naoya Horiguchi Cc: Mike Kravetz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hugetlb.h | 5 +++-- mm/hugetlb.c | 5 ++--- mm/mempolicy.c | 3 ++- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/include/linux/hugetlb.h b/include/linux/hugetlb.h index 612a29b7f6c6..36fa6a2a82e3 100644 --- a/include/linux/hugetlb.h +++ b/include/linux/hugetlb.h @@ -358,7 +358,8 @@ struct page *alloc_huge_page(struct vm_area_struct *vma, struct page *alloc_huge_page_node(struct hstate *h, int nid); struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, nodemask_t *nmask); -struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address); +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address); int huge_add_to_page_cache(struct page *page, struct address_space *mapping, pgoff_t idx); @@ -536,7 +537,7 @@ struct hstate {}; #define alloc_huge_page(v, a, r) NULL #define alloc_huge_page_node(h, nid) NULL #define alloc_huge_page_nodemask(h, preferred_nid, nmask) NULL -#define alloc_huge_page_vma(vma, address) NULL +#define alloc_huge_page_vma(h, vma, address) NULL #define alloc_bootmem_huge_page(h) NULL #define hstate_file(f) NULL #define hstate_sizelog(s) NULL diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 742a929f2311..7c204e3d132b 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -1675,16 +1675,15 @@ struct page *alloc_huge_page_nodemask(struct hstate *h, int preferred_nid, } /* mempolicy aware migration callback */ -struct page *alloc_huge_page_vma(struct vm_area_struct *vma, unsigned long address) +struct page *alloc_huge_page_vma(struct hstate *h, struct vm_area_struct *vma, + unsigned long address) { struct mempolicy *mpol; nodemask_t *nodemask; struct page *page; - struct hstate *h; gfp_t gfp_mask; int node; - h = hstate_vma(vma); gfp_mask = htlb_alloc_mask(h); node = huge_node(vma, address, gfp_mask, &mpol, &nodemask); page = alloc_huge_page_nodemask(h, node, nodemask); diff --git a/mm/mempolicy.c b/mm/mempolicy.c index 96823fa07f38..d879f1d8a44a 100644 --- a/mm/mempolicy.c +++ b/mm/mempolicy.c @@ -1121,7 +1121,8 @@ static struct page *new_page(struct page *page, unsigned long start, int **x) } if (PageHuge(page)) { - return alloc_huge_page_vma(vma, address); + return alloc_huge_page_vma(page_hstate(compound_head(page)), + vma, address); } else if (thp_migration_supported() && PageTransHuge(page)) { struct page *thp; From 859d4adc3415a64ccb8b0c50dc4e3a888dcb5805 Mon Sep 17 00:00:00 2001 From: Henry Willard Date: Wed, 31 Jan 2018 16:21:07 -0800 Subject: [PATCH 113/118] mm: numa: do not trap faults on shared data section pages. MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Workloads consisting of a large number of processes running the same program with a very large shared data segment may experience performance problems when numa balancing attempts to migrate the shared cow pages. This manifests itself with many processes or tasks in TASK_UNINTERRUPTIBLE state waiting for the shared pages to be migrated. The program listed below simulates the conditions with these results when run with 288 processes on a 144 core/8 socket machine. 
 Average throughput     Average throughput     Average throughput
 with numa_balancing=0  with numa_balancing=1  with numa_balancing=1
                        without the patch      with the patch
 ---------------------  ---------------------  ---------------------
       2118782                2021534                2107979

Complex production environments show less variability and fewer poorly performing outliers accompanied with a smaller number of processes waiting on NUMA page migration with this patch applied. In some cases, %iowait drops from 16%-26% to 0.

// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved.
 */
#include
#include
#include
#include

int a[1000000] = {13};

int main(int argc, const char **argv)
{
        int n = 0;
        int i;
        pid_t pid;
        int stat;
        int *count_array;
        int cpu_count = 288;
        long total = 0;
        struct timeval t1, t2 = {(argc > 1 ? atoi(argv[1]) : 10), 0};

        if (argc > 2)
                cpu_count = atoi(argv[2]);
        count_array = mmap(NULL, cpu_count * sizeof(int),
                           (PROT_READ|PROT_WRITE),
                           (MAP_SHARED|MAP_ANONYMOUS), 0, 0);
        if (count_array == MAP_FAILED) {
                perror("mmap:");
                return 0;
        }
        for (i = 0; i < cpu_count; ++i) {
                pid = fork();
                if (pid <= 0)
                        break;
                if ((i & 0xf) == 0)
                        usleep(2);
        }
        if (pid != 0) {
                if (i == 0) {
                        perror("fork:");
                        return 0;
                }
                for (;;) {
                        pid = wait(&stat);
                        if (pid < 0)
                                break;
                }
                for (i = 0; i < cpu_count; ++i)
                        total += count_array[i];
                printf("Total %ld\n", total);
                munmap(count_array, cpu_count * sizeof(int));
                return 0;
        }
        gettimeofday(&t1, 0);
        timeradd(&t1, &t2, &t1);
        while (timercmp(&t2, &t1, <)) {
                int b = 0;
                int j;

                for (j = 0; j < 1000000; j++)
                        b += a[j];
                gettimeofday(&t2, 0);
                n++;
        }
        count_array[i] = n;
        return 0;
}

This patch changes change_pte_range() to skip shared copy-on-write pages when called from change_prot_numa().

NOTE: change_prot_numa() is nominally called from task_numa_work() and queue_pages_test_walk(). task_numa_work() is the auto NUMA balancing path, and queue_pages_test_walk() is part of explicit NUMA policy management. However, queue_pages_test_walk() only calls change_prot_numa() when MPOL_MF_LAZY is specified and currently that is not allowed, so change_prot_numa() is only called from auto NUMA balancing. In the case of explicit NUMA policy management, shared pages are not migrated unless MPOL_MF_MOVE_ALL is specified, and MPOL_MF_MOVE_ALL depends on CAP_SYS_NICE. Currently, there is no way to pass information about MPOL_MF_MOVE_ALL to change_pte_range. This will have to be fixed if MPOL_MF_LAZY is enabled and MPOL_MF_MOVE_ALL is to be honored in lazy migration mode. task_numa_work() skips the read-only VMAs of programs and shared libraries. Link: http://lkml.kernel.org/r/1516751617-7369-1-git-send-email-henry.willard@oracle.com Signed-off-by: Henry Willard Reviewed-by: Håkon Bugge Reviewed-by: Steve Sistare Acked-by: Mel Gorman Cc: Kate Stewart Cc: Zi Yan Cc: Philippe Ombredanne Cc: Andrea Arcangeli Cc: Greg Kroah-Hartman Cc: Aneesh Kumar K.V Cc: Kirill A. Shutemov
Cc: "Jérôme Glisse" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/mprotect.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/mprotect.c b/mm/mprotect.c index 58b629bb70de..e3309fcf586b 100644 --- a/mm/mprotect.c +++ b/mm/mprotect.c @@ -84,6 +84,11 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd, if (!page || PageKsm(page)) continue; + /* Also skip shared copy-on-write pages */ + if (is_cow_mapping(vma->vm_flags) && + page_mapcount(page) != 1) + continue; + /* Avoid TLB flush if possible */ if (pte_protnone(oldpte)) continue; From da391d640c528bc5bb227ea5b39c882b75ac3167 Mon Sep 17 00:00:00 2001 From: William Kucharski Date: Wed, 31 Jan 2018 16:21:11 -0800 Subject: [PATCH 114/118] mm: correct comments regarding do_fault_around() There are multiple comments surrounding do_fault_around() that mention fault_around_pages() and fault_around_mask(), two routines that do not exist. These comments should be reworded to reference fault_around_bytes, the value which is used to determine how much do_fault_around() will attempt to read when processing a fault. These comments should have been updated when fault_around_pages() and fault_around_mask() were removed in commit aecd6f44266c ("mm: close race between do_fault_around() and fault_around_bytes_set()"). Fixes: aecd6f44266c1 ("mm: close race between do_fault_around() and fault_around_bytes_set()") Link: http://lkml.kernel.org/r/302D0B14-C7E9-44C6-8BED-033F9ACBD030@oracle.com Signed-off-by: William Kucharski Reviewed-by: Larry Bassel Cc: Michal Hocko Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index a6e5d6ac5d24..53373b7a1512 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -3511,9 +3511,8 @@ static int fault_around_bytes_get(void *data, u64 *val) } /* - * fault_around_pages() and fault_around_mask() expects fault_around_bytes - * rounded down to nearest page order. It's what do_fault_around() expects to - * see. + * fault_around_bytes must be rounded down to the nearest page order as it's + * what do_fault_around() expects to see. */ static int fault_around_bytes_set(void *data, u64 val) { @@ -3556,13 +3555,14 @@ late_initcall(fault_around_debugfs); * This function doesn't cross the VMA boundaries, in order to call map_pages() * only once. * - * fault_around_pages() defines how many pages we'll try to map. - * do_fault_around() expects it to return a power of two less than or equal to - * PTRS_PER_PTE. + * fault_around_bytes defines how many bytes we'll try to map. + * do_fault_around() expects it to be set to a power of two less than or equal + * to PTRS_PER_PTE. * - * The virtual address of the area that we map is naturally aligned to the - * fault_around_pages() value (and therefore to page order). This way it's - * easier to guarantee that we don't cross page table boundaries. + * The virtual address of the area that we map is naturally aligned to + * fault_around_bytes rounded down to the machine page size + * (and therefore to page order). This way it's easier to guarantee + * that we don't cross page table boundaries. */ static int do_fault_around(struct vm_fault *vmf) { @@ -3579,8 +3579,8 @@ static int do_fault_around(struct vm_fault *vmf) start_pgoff -= off; /* - * end_pgoff is either end of page table or end of vma - * or fault_around_pages() from start_pgoff, depending what is nearest.
+ * end_pgoff is either the end of the page table, the end of + * the vma or nr_pages from start_pgoff, depending what is nearest. */ end_pgoff = start_pgoff - ((vmf->address >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) + From 9bb5a391f9a5707e04763cf14298fc4cc29bfecd Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Wed, 31 Jan 2018 16:21:14 -0800 Subject: [PATCH 115/118] mm, memory_hotplug: fix memmap initialization Bharata has noticed that onlining newly added memory doesn't increase the total memory, pointing to commit f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") as a culprit. This commit changed the way the memory for memmaps is initialized and moved it from allocation time to initialization time. This works properly for the early memmap init path. It doesn't work for memory hotplug, though, because we need to mark the page as reserved when the sparsemem section is created and later initialize it completely during onlining. memmap_init_zone is called in the early stage of onlining. With the current code it calls __init_single_page and as such it clears up the whole state, and therefore online_pages_range skips those pages. Fix this by skipping mm_zero_struct_page in __init_single_page for the memory hotplug path. This is quite ugly, but unifying both early init and memory hotplug init paths is a large project. Make sure we plug the regression at least. Link: http://lkml.kernel.org/r/20180130101141.GW21609@dhcp22.suse.cz Fixes: f7f99100d8d9 ("mm: stop zeroing memory during allocation in vmemmap") Signed-off-by: Michal Hocko Reported-by: Bharata B Rao Tested-by: Bharata B Rao Reviewed-by: Pavel Tatashin Cc: Steven Sistare Cc: Daniel Jordan Cc: Bob Picco Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 22 ++++++++++++++-------- 1 file changed, 14 insertions(+), 8 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a6972750e7c5..c7dd9c86e353 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -1177,9 +1177,10 @@ static void free_one_page(struct zone *zone, } static void __meminit __init_single_page(struct page *page, unsigned long pfn, - unsigned long zone, int nid) + unsigned long zone, int nid, bool zero) { - mm_zero_struct_page(page); + if (zero) + mm_zero_struct_page(page); set_page_links(page, zone, nid, pfn); init_page_count(page); page_mapcount_reset(page); @@ -1194,9 +1195,9 @@ static void __meminit __init_single_page(struct page *page, unsigned long pfn, } static void __meminit __init_single_pfn(unsigned long pfn, unsigned long zone, - int nid) + int nid, bool zero) { - return __init_single_page(pfn_to_page(pfn), pfn, zone, nid); + return __init_single_page(pfn_to_page(pfn), pfn, zone, nid, zero); } #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -1217,7 +1218,7 @@ static void __meminit init_reserved_page(unsigned long pfn) if (pfn >= zone->zone_start_pfn && pfn < zone_end_pfn(zone)) break; } - __init_single_pfn(pfn, zid, nid); + __init_single_pfn(pfn, zid, nid, true); } #else static inline void init_reserved_page(unsigned long pfn) @@ -1534,7 +1535,7 @@ static unsigned long __init deferred_init_pages(int nid, int zid, } else { page++; } - __init_single_page(page, pfn, zid, nid); + __init_single_page(page, pfn, zid, nid, true); nr_pages++; } return (nr_pages); @@ -5399,15 +5400,20 @@ not_early: * can be created for invalid pages (for alignment) * check here not to call set_pageblock_migratetype() against * pfn out of zone.
+ * + * Please note that MEMMAP_HOTPLUG path doesn't clear memmap + * because this is done early in sparse_add_one_section */ if (!(pfn & (pageblock_nr_pages - 1))) { struct page *page = pfn_to_page(pfn); - __init_single_page(page, pfn, zone, nid); + __init_single_page(page, pfn, zone, nid, + context != MEMMAP_HOTPLUG); set_pageblock_migratetype(page, MIGRATE_MOVABLE); cond_resched(); } else { - __init_single_pfn(pfn, zone, nid); + __init_single_pfn(pfn, zone, nid, + context != MEMMAP_HOTPLUG); } } } From e02a9f048ef79a411904bef075fd3ce4204052a9 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Wed, 31 Jan 2018 16:21:19 -0800 Subject: [PATCH 116/118] mm/swap.c: make functions and their kernel-doc agree Fix some basic kernel-doc notation in mm/swap.c: - for function lru_cache_add_anon(), make its kernel-doc function name match its function name and change colon to hyphen following the function name - for function pagevec_lookup_entries(), change the function parameter name from nr_pages to nr_entries since that is more descriptive of what the parameter actually is and then it matches the kernel-doc comments also Fix function kernel-doc to match the change in commit 67fd707f4681: - drop the kernel-doc notation for @nr_pages from pagevec_lookup_range() and correct the function description for that change Link: http://lkml.kernel.org/r/3b42ee3e-04a9-a6ca-6be4-f00752a114fe@infradead.org Fixes: 67fd707f4681 ("mm: remove nr_pages argument from pagevec_lookup_{,range}_tag()") Signed-off-by: Randy Dunlap Reviewed-by: Andrew Morton Cc: Jan Kara Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/swap.c | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/mm/swap.c b/mm/swap.c index e824c800adca..10568b1548d4 100644 --- a/mm/swap.c +++ b/mm/swap.c @@ -411,7 +411,7 @@ static void __lru_cache_add(struct page *page) } /** - * lru_cache_add: add a page to the page lists + * lru_cache_add_anon - add a page to the page lists * @page: the page to add */ void lru_cache_add_anon(struct page *page) @@ -930,10 +930,10 @@ EXPORT_SYMBOL(__pagevec_lru_add); */ unsigned pagevec_lookup_entries(struct pagevec *pvec, struct address_space *mapping, - pgoff_t start, unsigned nr_pages, + pgoff_t start, unsigned nr_entries, pgoff_t *indices) { - pvec->nr = find_get_entries(mapping, start, nr_pages, + pvec->nr = find_get_entries(mapping, start, nr_entries, pvec->pages, indices); return pagevec_count(pvec); } @@ -965,9 +965,8 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * @mapping: The address_space to search * @start: The starting page index * @end: The final page index - * @nr_pages: The maximum number of pages * - * pagevec_lookup_range() will search for and return a group of up to @nr_pages + * pagevec_lookup_range() will search for & return a group of up to PAGEVEC_SIZE * pages in the mapping starting from index @start and upto index @end * (inclusive). The pages are placed in @pvec. pagevec_lookup() takes a * reference against the pages in @pvec. @@ -977,7 +976,7 @@ void pagevec_remove_exceptionals(struct pagevec *pvec) * also update @start to index the next page for the traversal. * * pagevec_lookup_range() returns the number of pages which were found. If this - * number is smaller than @nr_pages, the end of specified range has been + * number is smaller than PAGEVEC_SIZE, the end of specified range has been * reached. 
*/ unsigned pagevec_lookup_range(struct pagevec *pvec, From c7905f200225d4257536f19b11d18f598fee5f44 Mon Sep 17 00:00:00 2001 From: David Rientjes Date: Wed, 31 Jan 2018 16:21:23 -0800 Subject: [PATCH 117/118] tools, vm: new option to specify kpageflags file page-types currently hardcodes /proc/kpageflags as the file to parse. This works when using the tool to examine the state of pageflags on the same system, but does not allow storing a snapshot of pageflags at a given time to debug issues nor on a different system. This allows the user to specify a saved version of kpageflags with a new page-types -F option. [akpm@linux-foundation.org: add "filename" to fix usage() string] [rientjes@google.com: fix layout] Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1801301840050.140969@chino.kir.corp.google.com Link: http://lkml.kernel.org/r/alpine.DEB.2.10.1801301458180.153857@chino.kir.corp.google.com Signed-off-by: David Rientjes Reviewed-by: Andrew Morton Reviewed-by: Naoya Horiguchi Cc: Konstantin Khlebnikov Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/vm/page-types.c | 28 +++++++++++++++++++++------- 1 file changed, 21 insertions(+), 7 deletions(-) diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index e92903fc7113..a8783f48f77f 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -169,9 +169,10 @@ static int opt_raw; /* for kernel developers */ static int opt_list; /* list pages (in ranges) */ static int opt_no_summary; /* don't show summary */ static pid_t opt_pid; /* process to walk */ -const char * opt_file; /* file or directory path */ +const char *opt_file; /* file or directory path */ static uint64_t opt_cgroup; /* cgroup inode */ static int opt_list_cgroup;/* list page cgroup */ +static const char *opt_kpageflags;/* kpageflags file to parse */ #define MAX_ADDR_RANGES 1024 static int nr_addr_ranges; @@ -258,7 +259,7 @@ static int checked_open(const char *pathname, int flags) * pagemap/kpageflags routines */ -static unsigned long do_u64_read(int fd, char *name, +static unsigned long do_u64_read(int fd, const char *name, uint64_t *buf, unsigned long index, unsigned long count) @@ -283,7 +284,7 @@ static unsigned long kpageflags_read(uint64_t *buf, unsigned long index, unsigned long pages) { - return do_u64_read(kpageflags_fd, PROC_KPAGEFLAGS, buf, index, pages); + return do_u64_read(kpageflags_fd, opt_kpageflags, buf, index, pages); } static unsigned long kpagecgroup_read(uint64_t *buf, @@ -293,7 +294,7 @@ static unsigned long kpagecgroup_read(uint64_t *buf, if (kpagecgroup_fd < 0) return pages; - return do_u64_read(kpagecgroup_fd, PROC_KPAGEFLAGS, buf, index, pages); + return do_u64_read(kpagecgroup_fd, opt_kpageflags, buf, index, pages); } static unsigned long pagemap_read(uint64_t *buf, @@ -743,7 +744,7 @@ static void walk_addr_ranges(void) { int i; - kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); + kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); if (!nr_addr_ranges) add_addr_range(0, ULONG_MAX); @@ -790,6 +791,7 @@ static void usage(void) " -N|--no-summary Don't show summary info\n" " -X|--hwpoison hwpoison pages\n" " -x|--unpoison unpoison pages\n" +" -F|--kpageflags filename kpageflags file to parse\n" " -h|--help Show this usage message\n" "flags:\n" " 0x10 bitfield format, e.g.\n" @@ -1013,7 +1015,7 @@ static void walk_page_cache(void) { struct stat st; - kpageflags_fd = checked_open(PROC_KPAGEFLAGS, O_RDONLY); + kpageflags_fd = checked_open(opt_kpageflags, O_RDONLY); pagemap_fd = 
checked_open("/proc/self/pagemap", O_RDONLY); sigaction(SIGBUS, &sigbus_action, NULL); @@ -1164,6 +1166,11 @@ static void parse_bits_mask(const char *optarg) add_bits_filter(mask, bits); } +static void parse_kpageflags(const char *name) +{ + opt_kpageflags = name; +} + static void describe_flags(const char *optarg) { uint64_t flags = parse_flag_names(optarg, 0); @@ -1188,6 +1195,7 @@ static const struct option opts[] = { { "no-summary", 0, NULL, 'N' }, { "hwpoison" , 0, NULL, 'X' }, { "unpoison" , 0, NULL, 'x' }, + { "kpageflags", 0, NULL, 'F' }, { "help" , 0, NULL, 'h' }, { NULL , 0, NULL, 0 } }; @@ -1199,7 +1207,7 @@ int main(int argc, char *argv[]) page_size = getpagesize(); while ((c = getopt_long(argc, argv, - "rp:f:a:b:d:c:ClLNXxh", opts, NULL)) != -1) { + "rp:f:a:b:d:c:ClLNXxF:h", opts, NULL)) != -1) { switch (c) { case 'r': opt_raw = 1; @@ -1242,6 +1250,9 @@ int main(int argc, char *argv[]) opt_unpoison = 1; prepare_hwpoison_fd(); break; + case 'F': + parse_kpageflags(optarg); + break; case 'h': usage(); exit(0); @@ -1251,6 +1262,9 @@ int main(int argc, char *argv[]) } } + if (!opt_kpageflags) + opt_kpageflags = PROC_KPAGEFLAGS; + if (opt_cgroup || opt_list_cgroup) kpagecgroup_fd = checked_open(PROC_KPAGECGROUP, O_RDONLY); From 3f56a2f8030071cf86520ef4fc3045ba6856e610 Mon Sep 17 00:00:00 2001 From: Miles Chen Date: Wed, 31 Jan 2018 16:21:27 -0800 Subject: [PATCH 118/118] mm: remove PG_highmem description Commit cbe37d093707 ("[PATCH] mm: remove PG_highmem") removed PG_highmem to save a page flag. So the description of PG_highmem is no longer needed. Link: http://lkml.kernel.org/r/1517391212-2950-1-git-send-email-miles.chen@mediatek.com Signed-off-by: Miles Chen Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/page-flags.h | 5 ----- 1 file changed, 5 deletions(-) diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 3ec44e27aa9d..50c2b8786831 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -46,11 +46,6 @@ * guarantees that this bit is cleared for a page when it first is entered into * the page cache. * - * PG_highmem pages are not permanently mapped into the kernel virtual address - * space, they need to be kmapped separately for doing IO on the pages. The - * struct page (these bits with information) are always mapped into kernel - * address space... - * * PG_hwpoison indicates that a page got corrupted in hardware and contains * data with incorrect ECC bits that triggered a machine check. Accessing is * not safe since it may cause another machine check. Don't touch!